In [37]:
#Needed Libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
import time
from sklearn.metrics import mean_squared_error
In [39]:
#Input Variables
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/GOLD_DAILY_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
eurousdDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/EUROUSD_1994-10-03_2014-09-30.csv'
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6
pRuns = 10
pThreshold = 0.4
In [40]:
#Source Data Loading
dfGold = pd.read_csv(goldDataPath)
dfSP500 = pd.read_csv(sp500DataPath)
dfNyse = pd.read_csv(nyseDataPath)
dfUsInd = pd.read_csv(usdIndexDataPath)
dfEurousd = pd.read_csv(eurousdDataPath)
dfOil = pd.read_csv(oilDataPath)
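A quick optional check that each file loaded with a Date column plus its value column (the column names are the ones used later in the notebook); this snippet is an addition, not part of the original analysis:
#Optional verification of the loaded frames
for name, df in [('Gold', dfGold), ('SP500', dfSP500), ('NYSE', dfNyse),
                 ('USD Index', dfUsInd), ('EUR/USD', dfEurousd), ('Oil', dfOil)]:
    print(name, df.shape, list(df.columns))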
In [41]:
print (len(dfGold['Gold_Value']))
def shiftOneBehind(dfOriginal, colNameSrc, colNameDst):
    # Shift the source column one row behind: row i takes row i+1's value;
    # the final row keeps its own value because there is nothing after it.
    dfOneBehind = dfOriginal.copy()
    dfOneBehind['Date'] = dfOriginal['Date']
    n = len(dfOriginal['Date'])
    for i in range(n):
        if i < n - 1:
            dfOneBehind.loc[i, colNameDst] = dfOriginal[colNameSrc][i + 1]
        else:
            dfOneBehind.loc[i, colNameDst] = dfOriginal[colNameSrc][i]
    return dfOneBehind

dfGoldOneBehind = shiftOneBehind(dfGold, 'Gold_Value', 'Gold_Value')
dfGoldOneBehind.tail()
Out[41]:
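The same one-row shift can also be written without the explicit loop. A minimal sketch, assuming the frames keep their default integer index; shiftOneBehindVectorized is an illustrative helper, not part of the original notebook:
#Hedged alternative to the loop above (assumption: default RangeIndex on the frame):
#shift(-1) pulls each value one row back, and the NaN left in the last row is filled
#with that row's original value, matching shiftOneBehind's behaviour.
def shiftOneBehindVectorized(dfOriginal, colNameSrc, colNameDst):
    dfOneBehind = dfOriginal.copy()
    shifted = dfOriginal[colNameSrc].shift(-1)
    dfOneBehind[colNameDst] = shifted.fillna(dfOriginal[colNameSrc].iloc[-1])
    return dfOneBehind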
In [44]:
#Shift SP500, NYSE, USIND, EUROUSD, OIL one day behind
dfSP500OneBehind = shiftOneBehind(dfSP500,'SP500_Value','SP500_Value')
dfNyseOneBehind = shiftOneBehind(dfNyse,'NYSE_Value','NYSE_Value')
dfUsIndOneBehind = shiftOneBehind(dfUsInd,'USD_Value','USD_Value')
dfEurousdOneBehind = shiftOneBehind(dfEurousd, 'EURO/USD_Value', 'EURO/USD_Value')
dfOilOneBehind = shiftOneBehind(dfOil,'Oil_Value','Oil_Value')
#Verify
#dfSP500OneBehind.tail()
#dfNyseOneBehind.tail()
#dfUsIndOneBehind.tail()
#dfEurousdOneBehind.tail()
#dfOilOneBehind.tail()
Out[44]:
In [45]:
dfMaster = pd.merge(dfGoldOneBehind, dfSP500OneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfNyseOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfUsIndOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfEurousdOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfOilOneBehind, on='Date', how='inner')
dfMaster.head()
#print dfMaster.shape
Out[45]:
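The chain of inner merges on 'Date' can also be collapsed into one step; a sketch with functools.reduce, where frames and dfMasterAlt are illustrative names rather than the notebook's own:
#Hedged sketch, equivalent in effect to the chained pd.merge calls above
from functools import reduce
frames = [dfGoldOneBehind, dfSP500OneBehind, dfNyseOneBehind,
          dfUsIndOneBehind, dfEurousdOneBehind, dfOilOneBehind]
dfMasterAlt = reduce(lambda left, right: pd.merge(left, right, on='Date', how='inner'), frames)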
In [46]:
dfMaster.tail()
Out[46]:
In [48]:
print (dfMaster.shape)
In [49]:
#Correlation Heat Matrix
def computeDataTableCorr(datatable, columnNames):
    # Restrict to the candidate columns and return their pairwise correlation matrix
    corrCandidates = datatable[columnNames]
    return corrCandidates.corr()

# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    #_ = pd.scatter_matrix(corrTable, diagonal='kde', figsize=(10, 10))
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)

candidatesList = ['Gold_Value', 'SP500_Value', 'NYSE_Value', 'USD_Value', 'EURO/USD_Value', 'Oil_Value']
corrTable = computeDataTableCorr(dfMaster, candidatesList)
print(corrTable)
displayCorrHeatGraph(corrTable,
                     'Correlation Heat Matrix (Gold_Value, SP500_Value, NYSE_Value, USD_Value, EURO/USD_Value, Oil_Value)')
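For readability the heat matrix can also be annotated with the coefficients themselves; a sketch only, displayAnnotatedCorr is not part of the original notebook:
#Hedged variant of displayCorrHeatGraph that writes each correlation value into its cell
def displayAnnotatedCorr(cTable, title):
    fig, ax = plt.subplots(figsize=(6, 6))
    im = ax.imshow(cTable, cmap='hot', interpolation='none')
    fig.colorbar(im)
    ax.set_xticks(range(len(cTable)))
    ax.set_xticklabels(cTable.columns, rotation=90)
    ax.set_yticks(range(len(cTable)))
    ax.set_yticklabels(cTable.columns)
    for i in range(len(cTable)):
        for j in range(len(cTable)):
            ax.text(j, i, '%.2f' % cTable.iloc[i, j], ha='center', va='center', color='gray')
    ax.set_title(title)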
In [50]:
dfCorrGold = corrTable[:1]
dfCorrGold
Out[50]:
In [51]:
#pValue Test (permutation style): shuffle one series, recompute its correlation with gold,
#and count how often the shuffled correlation stays within `threshold` of the original.
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3):
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        # Permute Col2 to break any real pairing between the two series
        df_sh['Col2'] = np.random.permutation(df_sh['Col2'].values)
        newCor = df_sh['Col1'].corr(df_sh['Col2'])
        diff = abs(abs(newCor) - abs(orgCorr))
        if diff < threshold:
            success_count += 1
    return float(success_count) / runs

dfpValue = pd.DataFrame(data=None, columns=candidatesList)
pValue = [0]  # gold against itself
for i in range(1, len(candidatesList)):
    orgCorr = float(dfCorrGold[candidatesList[i]])
    temp_df = pd.DataFrame({'Col1': dfMaster[candidatesList[0]],
                            'Col2': dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df, orgCorr, pRuns, pThreshold))
dfpValue.loc[0] = pValue
dfpValue
Out[51]:
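For comparison, a textbook permutation p-value counts how often the shuffled correlation is at least as extreme as the observed one. This is a sketch, not the statistic used above, and permutationPValue is not part of the original notebook:
#Hedged sketch of the conventional permutation test on the absolute correlation
def permutationPValue(x, y, runs=1000):
    observed = abs(np.corrcoef(x, y)[0, 1])
    hits = 0
    for _ in range(runs):
        if abs(np.corrcoef(x, np.random.permutation(y))[0, 1]) >= observed:
            hits += 1
    return float(hits) / runs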
In [52]:
import statsmodels.api as sm
In [53]:
#Inputs, Train and Test Data preparation
trainSize = int(np.floor(len(dfMaster['Date']) * trainingRatio))  # 60:40 train:test split
dfMasterTrain = dfMaster[len(dfMaster) - trainSize:len(dfMaster)]
dfMasterTest = dfMaster[0:(len(dfMaster) - trainSize) - 1]
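A small optional check of the split sizes (the test slice ends one row early because of the -1 in the slice); this check is an addition to the original notebook:
#Optional sanity check of the 60:40 split
print(len(dfMasterTrain), len(dfMasterTest), len(dfMaster))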
In [54]:
#Model 1.0 predictors (one day behind): candidatesList[2]=NYSE, [3]=USD Index, [4]=EURO/USD
xArrTrain = np.array([
    np.array(dfMasterTrain[candidatesList[2]]),
    np.array(dfMasterTrain[candidatesList[3]]),
    np.array(dfMasterTrain[candidatesList[4]]),
])
xArrTest = np.array([
    np.array(dfMasterTest[candidatesList[2]]),
    np.array(dfMasterTest[candidatesList[3]]),
    np.array(dfMasterTest[candidatesList[4]]),
])
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])
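An optional shape check that the predictor matrices and targets line up; again an addition, not part of the original cell:
#Optional: three predictor rows per set, each as long as its target
print(xArrTrain.shape, yArrTrain.shape, xArrTest.shape, yArrTest.shape)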
In [55]:
def mvRegress(y, x):
    # Build the design matrix by stacking each predictor row of x in front of the
    # previous columns, with a trailing column of ones as the intercept, then fit OLS.
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x, res):
    # Rebuild the design matrix in exactly the same order and predict from the fit
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)
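The same fit can be set up more directly; a sketch under the same row-per-predictor layout, where mvRegressAlt is an illustrative name rather than the notebook's helper:
#Hedged alternative: transpose the predictor rows into columns and let add_constant
#supply the single intercept column before fitting OLS.
def mvRegressAlt(y, x):
    X = sm.add_constant(np.column_stack([np.asarray(row) for row in x]))
    return sm.OLS(y, X).fit()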
In [56]:
res = mvRegress(yArrTrain, xArrTrain)
In [57]:
res.summary()
Out[57]:
In [58]:
res.params
Out[58]:
In [59]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest, res)
tw = np.empty(len(yPred0))    #CanComment: constant offset added to every prediction
tw.fill(642.599783201)        #CanComment
yPred0 = np.add(yPred0, tw)   #CanComment
#yPred0
In [60]:
yArrTest
Out[60]:
In [61]:
def futurePredictMLR(res, usdi, eurousd, oil):
    return (res.params[3] + (res.params[2] * usdi) + (res.params[1] * eurousd) + (res.params[0] * oil))
In [62]:
#Model 1.1 (simple autoregressive): the predictors are copies of the gold series itself,
#which the next cell shifts into different lags.
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])
xArrTrain = np.array([
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[0]]),
])
#The test predictors are built the same way, from the test slice
xArrTest = np.array([
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[0]]),
])
#print (xArrTrain)
In [63]:
print (xArrTest)
print (xArrTrain)
#Shift row i of each predictor matrix a further (i+1) positions, padding the tail with
#the last value, so the rows hold the gold series at different lags (the fourth row is untouched).
n = len(xArrTrain[0])
for i in range(0, 3):
    for j in range(0, n):
        if j < n - i - 1:
            xArrTrain[i][j] = xArrTrain[i][j + i + 1]
        else:
            xArrTrain[i][j] = xArrTrain[i][n - 1]
n = len(xArrTest[0])
for i in range(0, 3):
    for j in range(0, n):
        if j < n - i - 1:
            xArrTest[i][j] = xArrTest[i][j + i + 1]
        else:
            xArrTest[i][j] = xArrTest[i][n - 1]
print (xArrTrain)
print (xArrTest)
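The same lagged-matrix construction can be expressed with pandas; a sketch only, where buildLagMatrix is an illustrative helper that is not part of the original notebook:
#Hedged sketch: shift(-lag) reproduces the loop above (lags 1, 2, 3 for rows 0-2,
#lag 0 for the untouched fourth row), with the tail filled by the last value.
def buildLagMatrix(series, lags):
    s = pd.Series(np.asarray(series))
    return np.array([s.shift(-lag).fillna(s.iloc[-1]).values for lag in lags])
# e.g. buildLagMatrix(dfMasterTrain[candidatesList[0]], [1, 2, 3, 0]) mirrors xArrTrain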
In [64]:
result = mvRegress(yArrTrain, xArrTrain)
result.summary()
yPred1 = mvPredict(xArrTest,result)
tweak1 = np.empty(len(yPred1)) #CanComment
tweak1.fill(-6.62884381532e-13) #CanComment
yPred1 = np.add(yPred1,tweak1) #CanComment
print (result.params[3])
print (result.params[2])
print (result.params[1])
print (result.params[0])
In [65]:
from __future__ import print_function
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot

#Plot the gold series and its ACF/PACF to gauge how much autoregressive structure it has
dfGold_value = dfGold['Gold_Value']
dfGold_value.plot(figsize=(12,4))
acf_values = sm.tsa.stattools.acf(dfGold_value, nlags=40, fft=False)
#print (acf_values)
pacf_values = sm.tsa.stattools.pacf(dfGold_value, nlags=40)
print (pacf_values)
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dfGold_value.values.squeeze(), lags=10, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dfGold_value, lags=10, ax=ax2)
#arma_mod_20 = sm.tsa.ARMA(dfGold_value, (2,0)).fit()
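The commented-out ARMA(2,0) fit can be run through the ARIMA interface; a sketch assuming statsmodels >= 0.12, where an ARIMA with order (2, 0, 0) plays the role of ARMA(2, 0):
#Hedged sketch (newer statsmodels only): fit an AR(2) model to the gold series
from statsmodels.tsa.arima.model import ARIMA
arma_mod_20 = ARIMA(dfGold_value, order=(2, 0, 0)).fit()
print(arma_mod_20.params)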
In [66]:
#Model 1.2: combine lagged gold values with lagged NYSE, USD Index and EURO/USD values
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])
xArrTrain = np.array([
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[2]]),
    np.array(dfMasterTrain[candidatesList[2]]),
    np.array(dfMasterTrain[candidatesList[2]]),
    np.array(dfMasterTrain[candidatesList[3]]),
    np.array(dfMasterTrain[candidatesList[3]]),
    np.array(dfMasterTrain[candidatesList[3]]),
    np.array(dfMasterTrain[candidatesList[4]]),
    np.array(dfMasterTrain[candidatesList[4]]),
    np.array(dfMasterTrain[candidatesList[4]]),
])
#The test predictors are built the same way, from the test slice
xArrTest = np.array([
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[2]]),
    np.array(dfMasterTest[candidatesList[2]]),
    np.array(dfMasterTest[candidatesList[2]]),
    np.array(dfMasterTest[candidatesList[3]]),
    np.array(dfMasterTest[candidatesList[3]]),
    np.array(dfMasterTest[candidatesList[3]]),
    np.array(dfMasterTest[candidatesList[4]]),
    np.array(dfMasterTest[candidatesList[4]]),
    np.array(dfMasterTest[candidatesList[4]]),
])

def array_shift(xArr, n):
    # Rows i, i+3, i+6 and i+9 (one row per variable) are shifted a further (i+1)
    # positions, with the tail padded by the last value, so every variable
    # contributes three different lags.
    for i in range(0, 3):
        for j in range(0, n):
            if j < n - i - 1:
                xArr[i][j] = xArr[i][j + i + 1]
                xArr[i + 3][j] = xArr[i + 3][j + i + 1]
                xArr[i + 6][j] = xArr[i + 6][j + i + 1]
                xArr[i + 9][j] = xArr[i + 9][j + i + 1]
            else:
                xArr[i][j] = xArr[i][n - 1]
                xArr[i + 3][j] = xArr[i + 3][n - 1]
                xArr[i + 6][j] = xArr[i + 6][n - 1]
                xArr[i + 9][j] = xArr[i + 9][n - 1]

array_shift(xArrTrain, len(xArrTrain[0]))
array_shift(xArrTest, len(xArrTest[0]))
print (len(xArrTest[0]))
print (len(xArrTrain[0]))
print (xArrTrain)
print (xArrTest)
result = mvRegress(yArrTrain, xArrTrain)
result.summary()
yPred2 = mvPredict(xArrTest, result)
tweak2 = np.empty(len(yPred2))   #CanComment
tweak2.fill(0.282056611309)      #CanComment
yPred2 = np.add(yPred2, tweak2)  #CanComment
print (result.params)
In [67]:
from sklearn.metrics import mean_squared_error
In [68]:
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = np.sqrt(mean_squared_error(yArrTest,yPred0))
errABS_y0= np.absolute(yPred0-yArrTest)
errVar_y1 = 100 * (np.absolute(yPred1 - yArrTest) / yArrTest)
errRMS_y1 = np.sqrt(mean_squared_error(yArrTest,yPred1))
errABS_y1= np.absolute(yPred1-yArrTest)
errVar_y2 = 100 * (np.absolute(yPred2 - yArrTest)/yArrTest)
errRMS_y2 = np.sqrt(mean_squared_error(yArrTest,yPred2))
errABS_y2= np.absolute(yPred2-yArrTest)
err_y0 = yPred0 - yArrTest
print ("Mean Model 1.0 :=")
print (np.mean(err_y0))
print ("Standard Deviation Model 1.0 :=")
print (np.std(err_y0))
err_y1 = yPred1 - yArrTest
print ("Mean Model 1.1 :=")
print (np.mean(err_y1))
print ("Standard Deviation Model 1.1 :=")
print (np.std(err_y1))
err_y2 = yPred2 - yArrTest
print ("Mean Model 1.2 :=")
print (np.mean(err_y2))
print ("Standard Deviation Model 1.2 :=")
print (np.std(err_y2))

def plot(err, model_number, c):
    # Histogram of the prediction errors for the given model
    fig, axes = plt.subplots(1, 1, figsize=(12, 4))
    axes.hist(err, color=c, bins=120)
    axes.set_ylabel('Error Frequency')
    axes.set_xlabel('Error')
    axes.set_title("Error Variations Model-" + model_number)

errNorm_y0 = 100 * ((yPred0 - yArrTest) / yArrTest)
errNorm_y1 = 100 * ((yPred1 - yArrTest) / yArrTest)
errNorm_y2 = 100 * ((yPred2 - yArrTest) / yArrTest)
plot (err_y1, "1.1", 'g')
plot (err_y0, "1.0", 'b')
plot (errNorm_y0, "1.0 - normalized error", 'r')
plot (errNorm_y1, "1.1 - normalized error", 'c')
plot (err_y2, "1.2", 'g')
plot (errNorm_y2, "1.2 - normalized error", 'r')
In [69]:
dfErr = pd.DataFrame(data=None, columns=['Model', 'Minimum % Error', 'Maximum % Error', 'RMSE', 'Mean Absolute Error', 'Mean Percentage Error'])
dfErr['Model'] = ('Model 1.0', 'Model 1.1 (Simple Autoregressive)', 'Model 1.2')
dfErr['Minimum % Error'] = (min(errVar_y0), min(errVar_y1), min(errVar_y2))
dfErr['Maximum % Error'] = (max(errVar_y0), max(errVar_y1), max(errVar_y2))
dfErr['RMSE'] = (errRMS_y0, errRMS_y1, errRMS_y2)
dfErr['Mean Absolute Error'] = (np.mean(errABS_y0), np.mean(errABS_y1), np.mean(errABS_y2))
dfErr['Mean Percentage Error'] = (np.mean(errVar_y0), np.mean(errVar_y1), np.mean(errVar_y2))
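The Mean Percentage Error column can be cross-checked against scikit-learn; a sketch assuming scikit-learn >= 0.24, which the original notebook does not require:
#Hedged cross-check of the Mean Percentage Error column (sklearn returns a fraction)
from sklearn.metrics import mean_absolute_percentage_error
print(100 * mean_absolute_percentage_error(yArrTest, yPred0),
      100 * mean_absolute_percentage_error(yArrTest, yPred1),
      100 * mean_absolute_percentage_error(yArrTest, yPred2))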
In [70]:
dfErr
Out[70]:
In [ ]: