Linear Regression Model for Predicting Gold Price


In [1]:
#Needed Libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
import time
from sklearn.metrics import mean_squared_error

In [2]:
#Input Variables
# Daily close series, 1994-10-03 .. 2014-09-30, hosted in the project repo.
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/GOLD_DAILY_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
eurousdDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/EUROUSD_1994-10-03_2014-09-30.csv'
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6  # train:test split ratio (60:40)
pRuns = 10   # shuffle runs per p-value estimate
pThreshold = 0.4  # |corr| closeness threshold used by the shuffle p-value test

In [3]:
#Source Data Loading
# Each CSV appears to contain a 'Date' column plus one value column named
# <ASSET>_Value (schema inferred from the column names used below — TODO confirm).

dfGold = pd.read_csv(goldDataPath)
dfSP500 = pd.read_csv(sp500DataPath)
dfNyse = pd.read_csv(nyseDataPath)
dfUsInd = pd.read_csv(usdIndexDataPath)
dfEurousd = pd.read_csv(eurousdDataPath)
dfOil = pd.read_csv(oilDataPath)

In [4]:
print (len(dfGold['Gold_Value']))

def shiftOneBehind(dfOriginal, colNameSrc, colNameDst):
    """Return a copy of dfOriginal where colNameDst holds colNameSrc moved one
    row up: row i takes row i+1's value, and the last row keeps its own value.

    The frames here are sorted newest-first, so this pairs each date with the
    previous trading day's value.

    Fixes vs. the original: the per-row chained-indexing assignment
    (dfOneBehind[col][i] = ...) triggers SettingWithCopyWarning and is not
    guaranteed to write through on modern pandas; this version is a single
    vectorized assignment and also preserves the column's dtype.
    """
    dfOneBehind = dfOriginal.copy()
    vals = dfOriginal[colNameSrc].to_numpy()
    # vals[1:] shifted up, last element repeated; handles len 0 and 1 too.
    dfOneBehind[colNameDst] = np.concatenate([vals[1:], vals[-1:]])
    return dfOneBehind


# Build the one-day-behind gold series and spot-check the tail.
# (The empty-DataFrame placeholder that used to precede this call was dead
# code — it was overwritten on the very next line — so it has been removed.)
dfGoldOneBehind = shiftOneBehind(dfGold,'Gold_Value','Gold_Value')
dfGoldOneBehind.tail()


5217
Out[4]:
Date Gold_Value
5212 10/7/1994 392.00
5213 10/6/1994 393.70
5214 10/5/1994 392.45
5215 10/4/1994 393.20
5216 10/3/1994 393.20

In [5]:
#Shift SP500, NYSE, USIND, EUROUSD, OIL one day behind
dfSP500OneBehind = shiftOneBehind(dfSP500,'SP500_Value','SP500_Value')
dfNyseOneBehind = shiftOneBehind(dfNyse,'NYSE_Value','NYSE_Value')
dfUsIndOneBehind = shiftOneBehind(dfUsInd,'USD_Value','USD_Value')
dfEurousdOneBehind = shiftOneBehind(dfEurousd, 'EURO/USD_Value', 'EURO/USD_Value')
dfOilOneBehind = shiftOneBehind(dfOil,'Oil_Value','Oil_Value')
#Verify each tail. print() with a single argument behaves identically on
# Python 2 and 3, unlike the bare `print x` statements used originally,
# which are a SyntaxError under Python 3.
print(dfSP500OneBehind.tail())
print(dfNyseOneBehind.tail())
print(dfUsIndOneBehind.tail())
print(dfEurousdOneBehind.tail())
print(dfOilOneBehind.tail())


           Date  SP500_Value
5030  10/7/1994       452.36
5031  10/6/1994       453.52
5032  10/5/1994       454.59
5033  10/4/1994       461.74
5034  10/3/1994       461.74
           Date  NYSE_Value
5030  10/7/1994     2642.69
5031  10/6/1994     2647.24
5032  10/5/1994     2658.87
5033  10/4/1994     2695.67
5034  10/3/1994     2695.67
           Date  USD_Value
5024  10/7/1994    85.8219
5025  10/6/1994    85.7657
5026  10/5/1994    85.9173
5027  10/4/1994    85.9581
5028  10/3/1994    85.9581
           Date  EURO/USD_Value
5104  10/7/1994          1.2424
5105  10/6/1994          1.2432
5106  10/5/1994          1.2382
5107  10/4/1994          1.2328
5108  10/3/1994          1.2328
           Date  Oil_Value
5018  10/7/1994      18.24
5019  10/6/1994      18.01
5020  10/5/1994      17.97
5021  10/4/1994      18.16
5022  10/3/1994      18.16

In [6]:
# Inner-join every series on Date so only days present in ALL sources remain
# (the row count drops from 5217 gold rows to 4921 merged rows per the outputs).
dfMaster = pd.merge(dfGoldOneBehind, dfSP500OneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfNyseOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfUsIndOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfEurousdOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfOilOneBehind, on='Date', how='inner')
dfMaster.head()
#print dfMaster.shape


Out[6]:
Date Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
0 9/30/2014 1219.5 1977.80 10749.05 80.9136 1.27103 94.53
1 9/29/2014 1213.8 1982.85 10798.88 80.9983 1.27777 95.55
2 9/26/2014 1213.8 1965.99 10722.21 80.5957 1.28405 93.59
3 9/25/2014 1217.3 1998.30 10885.60 80.4465 1.28558 93.60
4 9/24/2014 1222.0 1982.77 10815.42 80.1793 1.28396 91.55

In [7]:
dfMaster.tail()


Out[7]:
Date Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
4916 10/7/1994 392.00 452.36 2642.69 85.8219 1.2424 18.24
4917 10/6/1994 393.70 453.52 2647.24 85.7657 1.2432 18.01
4918 10/5/1994 392.45 454.59 2658.87 85.9173 1.2382 17.97
4919 10/4/1994 393.20 461.74 2695.67 85.9581 1.2328 18.16
4920 10/3/1994 393.20 461.74 2695.67 85.9581 1.2328 18.16

In [8]:
# print() form works on both Python 2 and 3; the bare statement does not.
print(dfMaster.shape)


(4921, 7)

In [9]:
#Corelation Heat Matrix

def computeDataTableCorr(datatable, columnNames):
    """Return the pairwise Pearson correlation matrix over the given columns.

    Bug fix: the original body read the module-level `candidatesList` instead
    of the `columnNames` parameter, silently ignoring the argument. Callers
    happened to pass candidatesList, so observed behavior is unchanged.
    """
    corrCandidates = datatable[columnNames]
    return corrCandidates.corr()

# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    #_ = pd.scatter_matrix(corrTable, diagonal='kde', figsize=(10, 10))
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)
    
# Columns considered as regression candidates; index 0 (gold) is the target.
candidatesList = ['Gold_Value', 'SP500_Value', 'NYSE_Value', 'USD_Value', 'EURO/USD_Value', 'Oil_Value']
corrTable = computeDataTableCorr(dfMaster,candidatesList)
print(corrTable)

displayCorrHeatGraph(corrTable,
    'Correlation Heat Matrix (Gold_Value, SP500_Value, NYSE_Value, USD_Value, EURO/USD_Value, Oil_Value)')


                Gold_Value  SP500_Value  NYSE_Value  USD_Value  \
Gold_Value        1.000000     0.461485    0.578967  -0.792106   
SP500_Value       0.461485     1.000000    0.940998  -0.241948   
NYSE_Value        0.578967     0.940998    1.000000  -0.462967   
USD_Value        -0.792106    -0.241948   -0.462967   1.000000   
EURO/USD_Value    0.628074     0.105645    0.350845  -0.945491   
Oil_Value         0.860482     0.625074    0.789134  -0.805489   

                EURO/USD_Value  Oil_Value  
Gold_Value            0.628074   0.860482  
SP500_Value           0.105645   0.625074  
NYSE_Value            0.350845   0.789134  
USD_Value            -0.945491  -0.805489  
EURO/USD_Value        1.000000   0.692399  
Oil_Value             0.692399   1.000000  

In [10]:
# First row of the correlation matrix: gold's correlation with every factor.
dfCorrGold = corrTable[:1]
dfCorrGold


Out[10]:
Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
Gold_Value 1 0.461485 0.578967 -0.792106 0.628074 0.860482

p-Values


In [11]:
#pValue Test
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):     
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        df_sh.apply(np.random.shuffle, axis=axis)
        newCor = df_sh['Col1'].corr(df_sh['Col2'])
        if (orgCorr < 0): orgCorr = orgCorr * -1
        if (newCor < 0): newCor = newCor * -1
        #print(newCor)
        diff = abs(newCor - orgCorr)
        #print(diff)
        if (diff < threshold): success_count = success_count + 1
    p = success_count / runs
    return p

# One p-value per candidate factor; gold-vs-gold is trivially 0.
dfpValue = pd.DataFrame(data=None, columns=candidatesList)
pValue = [0]

for i in range(1, len(candidatesList)):
    # np.float was removed in NumPy 1.24, and float(Series) is deprecated;
    # dfCorrGold has exactly one row, so take the scalar explicitly.
    orgCorr = float(dfCorrGold[candidatesList[i]].iloc[0])
    temp_df = pd.DataFrame({'Col1': dfMaster[candidatesList[0]],
                            'Col2': dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df, orgCorr, pRuns, pThreshold))

dfpValue.loc[0] = pValue
dfpValue


Out[11]:
Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
0 0 0 0 0 0 0

As the p-values are all zero, the observed correlations are unlikely to have arisen by chance.

Model 1.0: Multivariate Linear Regression


In [12]:
import statsmodels.api as sm

In [13]:
#Inputs, Train and Test Data preparation
trainSize = np.floor(len(dfMaster['Date']) * trainingRatio) #60:40 ratio
dfMasterTrain = dfMaster[len(dfMaster)-np.int(trainSize):len(dfMaster)]
dfMasterTest = dfMaster[0:(len(dfMaster)-np.int(trainSize))-1]

In [14]:
#[2]NYSE [3]USD [4]EURO/USD — indices into candidatesList.
# NOTE(review): the original comment said "[2]USD [3]EURO/USD [4]OIL", which
# does not match candidatesList (['Gold','SP500','NYSE','USD','EURO/USD','Oil']).
# The regressors actually used are NYSE, USD and EURO/USD.
xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         np.array(dfMasterTrain[candidatesList[4]]), \
         ]
xArrTrain = np.array(xArrTrain)
xArrTest = [ \
         #np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         np.array(dfMasterTest[candidatesList[4]]), \
         ]
xArrTest = np.array(xArrTest)

# Target: gold value (candidatesList[0]).
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

In [15]:
def mvRegress(y, x):
    """Fit OLS of y on the regressors given as the rows of x.

    The design matrix is built by prepending each successive regressor, so the
    final column order is [x[-1], ..., x[1], x[0], ones] and the fitted
    params come out in that order (intercept last).
    NOTE(review): a ones column is stacked in explicitly AND sm.add_constant
    is called on every iteration; with modern statsmodels (has_constant='skip'
    default) the add_constant calls are no-ops because a constant column is
    already present, but this may differ across statsmodels versions — confirm
    before restructuring.
    """
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x,res):
    """Predict from a fitted OLS result `res` for regressors given as rows of x.

    Rebuilds the design matrix with exactly the same column order used at fit
    time ([x[-1], ..., x[0], ones]); the orders must match or coefficients
    would be applied to the wrong columns.
    """
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)

In [16]:
res = mvRegress(yArrTrain, xArrTrain)

In [17]:
res.summary()


Out[17]:
OLS Regression Results
Dep. Variable: y R-squared: 0.709
Model: OLS Adj. R-squared: 0.708
Method: Least Squares F-statistic: 2390.
Date: Sun, 02 Nov 2014 Prob (F-statistic): 0.00
Time: 18:31:32 Log-Likelihood: -15527.
No. Observations: 2952 AIC: 3.106e+04
Df Residuals: 2948 BIC: 3.109e+04
Df Model: 3
coef std err t P>|t| [95.0% Conf. Int.]
x1 -19.4471 24.616 -0.790 0.430 -67.713 28.819
x2 -8.3081 0.383 -21.684 0.000 -9.059 -7.557
x3 0.0189 0.001 26.340 0.000 0.018 0.020
const 1048.1661 65.417 16.023 0.000 919.899 1176.434
Omnibus: 342.707 Durbin-Watson: 0.008
Prob(Omnibus): 0.000 Jarque-Bera (JB): 514.982
Skew: 0.847 Prob(JB): 1.49e-112
Kurtosis: 4.148 Cond. No. 4.85e+05

In [18]:
res.params


Out[18]:
array([ -1.94470574e+01,  -8.30814665e+00,   1.89479629e-02,
         1.04816608e+03])

In [19]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest,res)
#yPred0

In [20]:
yArrTest


Out[20]:
array([ 1219.5,  1213.8,  1213.8, ...,   594. ,   590. ,   595.1])

In [21]:
def futurePredictMLR(res,usdi,eurousd,oil):
    # Applies the Model 1.0 coefficients by hand: params[3] is the intercept;
    # params[2], params[1], params[0] multiply the three inputs in turn.
    # NOTE(review): the parameter names look mislabeled — the model was fit on
    # candidatesList[2..4] = NYSE, USD, EURO/USD (see the training cell), not
    # USD/EURO-USD/OIL. Confirm which series callers actually pass in.
    return (res.params[3] + (res.params[2] * usdi) + (res.params[1] * eurousd) + (res.params[0] * oil))

Future Gold Price (By Model 1.0 - Multivariate Linear Regression)

GP(i) = 1048.1661 + (0.0189 * nyse(i)) + (-8.3081 * usd(i)) + (-19.4471 * eurousd(i))  — note: the regressors used to fit Model 1.0 are candidatesList indices 2–4 (NYSE, USD, EURO/USD), not USD/EURO-USD/Oil as previously labelled.

Model 1.1: Simple Autoregressive Model based on the last three day prices only.


In [22]:
# Model 1.1 inputs: target is gold; start from three identical copies of the
# gold series — the next cell shifts them into lag-1, lag-2 and lag-3 rows.
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         ]
xArrTrain = np.array(xArrTrain)
#train is essentially the same as test!! 

xArrTest = [ \
         #np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         ]
xArrTest = np.array(xArrTest)




#print (xArrTrain)            
#print (xArrTrain)            
#print(xArrTrain[0][])

In [23]:
print (xArrTest)
print (xArrTrain)
#print (xArrTrain[0][5])
# Turn row i into the gold series lagged by (i+1) rows. Rows are newest-first,
# so each position j reads j+i+1 positions ahead (an older day); positions
# that would run past the end are padded with the oldest value x[i][n-1].
# The in-place update is safe because each write at j only reads index
# j+i+1 > j, which has not been overwritten yet.
n=len(xArrTrain[0])
for i in range(0,3,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTrain[i][j]=xArrTrain[i][j+i+1]
        else:
            xArrTrain[i][j]=xArrTrain[i][n-1]
            
# Same lagging applied to the test copies.
n=len(xArrTest[0])
for i in range(0,3,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTest[i][j]=xArrTest[i][j+i+1]
        else:
            xArrTest[i][j]=xArrTest[i][n-1]   

print (xArrTrain)
print (xArrTest)


[[ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]
 [ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]
 [ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]]
[[ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]
 [ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]
 [ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]]
[[ 574.1  571.4  575.3 ...,  393.2  393.2  393.2]
 [ 571.4  575.3  573.3 ...,  393.2  393.2  393.2]
 [ 575.3  573.3  573.6 ...,  393.2  393.2  393.2]]
[[ 1213.8  1213.8  1217.3 ...,   590.    595.1   595.1]
 [ 1213.8  1217.3  1222.  ...,   595.1   595.1   595.1]
 [ 1217.3  1222.   1213.5 ...,   595.1   595.1   595.1]]

In [24]:
# Fit the 3-lag autoregressive model and predict on the test lags.
result = mvRegress(yArrTrain, xArrTrain)
result.summary()  # NOTE(review): mid-cell expression — its output is discarded
yPred1 = mvPredict(xArrTest,result)
# NOTE(review): magic bias constant added to every prediction; its provenance
# is undocumented (possibly an empirical mean-error correction) — confirm.
tweak = np.empty(len(yPred1))
tweak.fill(0.036515358062)
yPred1 = np.add(yPred1,tweak)
print (result.params[3])
print (result.params[2])
print (result.params[1])
print (result.params[0])


0.00705306597369
1.01014590019
-0.0574113039927
0.0474216676995

Time Series Analysis - Auto Regression Models

Model 1.2: Univariate Autogressive Processes (AR)

Model 1.3: Autogressive Moving-Average Processes (ARMA) and Kalman Filter

Model 1.4: Vector Autogressive Processes (VAR)

Error Metrics


In [25]:
from sklearn.metrics import mean_squared_error

In [26]:
# Error metrics for Model 1.0 (yPred0) and Model 1.1 (yPred1):
# percentage error, RMSE and absolute error against the held-out gold prices.
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = np.sqrt(mean_squared_error(yArrTest,yPred0))
errABS_y0= np.absolute(yPred0-yArrTest)

errVar_y1 = 100 * (np.absolute(yPred1 - yArrTest) / yArrTest)
errRMS_y1 = np.sqrt(mean_squared_error(yArrTest,yPred1))
errABS_y1= np.absolute(yPred1-yArrTest)

# Signed error distribution for Model 1.1. The original used a Python 2
# `print x` statement, which is a SyntaxError under Python 3.
err_y1 = yPred1 - yArrTest
print(np.mean(err_y1))

fig, axes = plt.subplots(1, 1, figsize=(12,4))
axes.hist(err_y1, color='g', bins=120)
axes.set_ylabel('Error Frequency')
axes.set_xlabel('Error')
axes.set_title("Error Variations Model-1.1")


-0.0988469883172
Out[26]:
<matplotlib.text.Text at 0x7f429a638a50>

In [27]:
# Summary table comparing both models, built in a single constructor instead
# of assigning length-2 tuples column-by-column to an empty frame (fragile
# across pandas versions).
dfErr = pd.DataFrame({
    'Model': ['Model 1.0', 'Model 1.1 (Simple Autoregressive)'],
    'Minimum % Error': [min(errVar_y0), min(errVar_y1)],
    'Maximum % Error': [max(errVar_y0), max(errVar_y1)],
    'RMSE Error': [errRMS_y0, errRMS_y1],
    'Mean Absolute Error': [np.mean(errABS_y0), np.mean(errABS_y1)],
    'Mean Percentage Error': [np.mean(errVar_y0), np.mean(errVar_y1)],
}, columns=['Model', 'Minimum % Error', 'Maximum % Error', 'RMSE Error',
            'Mean Absolute Error', 'Mean Percentage Error'])

In [28]:
dfErr


Out[28]:
Model Minimum % Error Maximum % Error RMSE Error Mean Absolute Error Mean Percentage Error
0 Model 1.0 12.658695 69.811997 723.910077 642.599783 49.591577
1 Model 1.1 (Simple Autoregressive) 0.001525 10.104877 15.519673 10.568039 0.905870

In [26]: