In [1]:
#Needed Libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
import time
from sklearn.metrics import mean_squared_error
In [2]:
#Input Variables
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/GOLD_DAILY_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
eurousdDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/EUROUSD_1994-10-03_2014-09-30.csv'
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6
pRuns = 10
pThreshold = 0.4
In [3]:
#Source Data Loading
dfGold = pd.read_csv(goldDataPath)
dfSP500 = pd.read_csv(sp500DataPath)
dfNyse = pd.read_csv(nyseDataPath)
dfUsInd = pd.read_csv(usdIndexDataPath)
dfEurousd = pd.read_csv(eurousdDataPath)
dfOil = pd.read_csv(oilDataPath)
In [4]:
print(len(dfGold['Gold_Value']))

def shiftOneBehind(dfOriginal, colNameSrc, colNameDst):
    """Shift colNameSrc one row behind into colNameDst, repeating the last value."""
    dfOneBehind = dfOriginal.copy()
    dfOneBehind['Date'] = dfOriginal['Date']
    for i in range(len(dfOriginal['Date'])):
        if i < len(dfOriginal['Date']) - 1:
            dfOneBehind.loc[i, colNameDst] = dfOriginal[colNameSrc][i + 1]
        else:
            dfOneBehind.loc[i, colNameDst] = dfOriginal[colNameSrc][i]
    return dfOneBehind

dfGoldOneBehind = shiftOneBehind(dfGold, 'Gold_Value', 'Gold_Value')
dfGoldOneBehind.tail()
Out[4]:
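The loop above can also be expressed with pandas' built-in shift; a minimal vectorized sketch of the same operation (shiftOneBehindVectorized is an illustrative helper, not part of the original notebook):

# Vectorized equivalent of shiftOneBehind: move colNameSrc one row up and
# repeat the final value, matching the loop's behaviour.
def shiftOneBehindVectorized(dfOriginal, colNameSrc, colNameDst):
    dfOneBehind = dfOriginal.copy()
    shifted = dfOriginal[colNameSrc].shift(-1)
    shifted.iloc[-1] = dfOriginal[colNameSrc].iloc[-1]  # last row has no successor
    dfOneBehind[colNameDst] = shifted
    return dfOneBehind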
In [5]:
#Shift SP500, NYSE, USIND, EUROUSD, OIL one day behind
dfSP500OneBehind = shiftOneBehind(dfSP500,'SP500_Value','SP500_Value')
dfNyseOneBehind = shiftOneBehind(dfNyse,'NYSE_Value','NYSE_Value')
dfUsIndOneBehind = shiftOneBehind(dfUsInd,'USD_Value','USD_Value')
dfEurousdOneBehind = shiftOneBehind(dfEurousd, 'EURO/USD_Value', 'EURO/USD_Value')
dfOilOneBehind = shiftOneBehind(dfOil,'Oil_Value','Oil_Value')
#Verify
print(dfSP500OneBehind.tail())
print(dfNyseOneBehind.tail())
print(dfUsIndOneBehind.tail())
print(dfEurousdOneBehind.tail())
print(dfOilOneBehind.tail())
In [6]:
dfMaster = pd.merge(dfGoldOneBehind, dfSP500OneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfNyseOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfUsIndOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfEurousdOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfOilOneBehind, on='Date', how='inner')
dfMaster.head()
#print dfMaster.shape
Out[6]:
In [7]:
dfMaster.tail()
Out[7]:
In [8]:
print(dfMaster.shape)
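The five pairwise merges above can also be collapsed into a single reduce call; a sketch with the same 'Date' key and inner-join semantics (dfMasterAlt is just an illustrative name):

from functools import reduce

frames = [dfGoldOneBehind, dfSP500OneBehind, dfNyseOneBehind,
          dfUsIndOneBehind, dfEurousdOneBehind, dfOilOneBehind]
# successively inner-join every frame on 'Date'; equivalent to the chained pd.merge calls
dfMasterAlt = reduce(lambda left, right: pd.merge(left, right, on='Date', how='inner'), frames)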
In [9]:
# Correlation heat matrix
def computeDataTableCorr(datatable, columnNames):
    corrCandidates = datatable[columnNames]
    return corrCandidates.corr()

# Plotting the correlation heat map
def displayCorrHeatGraph(cTable, title):
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)

candidatesList = ['Gold_Value', 'SP500_Value', 'NYSE_Value', 'USD_Value', 'EURO/USD_Value', 'Oil_Value']
corrTable = computeDataTableCorr(dfMaster, candidatesList)
print(corrTable)
displayCorrHeatGraph(corrTable,
                     'Correlation Heat Matrix (Gold_Value, SP500_Value, NYSE_Value, USD_Value, EURO/USD_Value, Oil_Value)')
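As a cross-check on the heat map, the Gold row of the correlation table can be read off directly (a one-line sketch, no new data involved):

# correlation of Gold_Value with every candidate series, strongest first
print(corrTable.loc['Gold_Value'].sort_values(ascending=False))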
In [10]:
dfCorrGold = corrTable[:1]
dfCorrGold
Out[10]:
In [11]:
# p-value test via shuffling
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):
    """Shuffle the columns independently and count how often the shuffled
    correlation stays within `threshold` of the original correlation."""
    df_sh = df.copy(deep=True)
    success_count = 0
    orgCorr = abs(orgCorr)
    for _ in range(runs):
        df_sh.apply(np.random.shuffle, axis=axis)
        newCor = abs(df_sh['Col1'].corr(df_sh['Col2']))
        diff = abs(newCor - orgCorr)
        if diff < threshold:
            success_count += 1
    return success_count / float(runs)

dfpValue = pd.DataFrame(data=None, columns=candidatesList)
pValue = [0]
for i in range(1, len(candidatesList)):
    orgCorr = float(dfCorrGold[candidatesList[i]].iloc[0])
    temp_df = pd.DataFrame({'Col1': dfMaster[candidatesList[0]],
                            'Col2': dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df, orgCorr, pRuns, pThreshold))
dfpValue.loc[0] = pValue
dfpValue
Out[11]:
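For comparison, scipy's pearsonr reports an analytic p-value for the null hypothesis of zero correlation; that is a different quantity from the threshold-based shuffle statistic above, but it makes a quick sanity check (assumes scipy is available in this environment):

from scipy import stats

# analytic Pearson r and p-value for Gold_Value against each candidate predictor
for col in candidatesList[1:]:
    r, p = stats.pearsonr(dfMaster['Gold_Value'], dfMaster[col])
    print(col, round(r, 4), p)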
In [12]:
import statsmodels.api as sm
In [13]:
# Train and test data preparation (chronological 60:40 split)
trainSize = int(np.floor(len(dfMaster['Date']) * trainingRatio))
dfMasterTrain = dfMaster[len(dfMaster) - trainSize:len(dfMaster)]
dfMasterTest = dfMaster[0:(len(dfMaster) - trainSize) - 1]
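As a quick check on the split (the source CSVs appear to be ordered most-recent-first, as the one-day shift above implies, so the training slice covers the older dates and the test slice the newer ones), a small sketch that prints the size and date span of each part:

# sanity check on the chronological split; assumes 'Date' is ordered most-recent-first
for name, part in [('train', dfMasterTrain), ('test', dfMasterTest)]:
    print(name, len(part), part['Date'].iloc[-1], '->', part['Date'].iloc[0])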
In [14]:
# Predictor columns from candidatesList: [2] NYSE_Value, [3] USD_Value, [4] EURO/USD_Value
xArrTrain = np.array([
    np.array(dfMasterTrain[candidatesList[2]]),
    np.array(dfMasterTrain[candidatesList[3]]),
    np.array(dfMasterTrain[candidatesList[4]]),
])
xArrTest = np.array([
    np.array(dfMasterTest[candidatesList[2]]),
    np.array(dfMasterTest[candidatesList[3]]),
    np.array(dfMasterTest[candidatesList[4]]),
])
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])
In [15]:
def mvRegress(y, x):
    # Stack the predictor rows of x into a design matrix. The columns end up in
    # reverse order (x[-1], ..., x[1], x[0]) with the ones column last, which is
    # what fixes the ordering of res.params used later on.
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x, res):
    # Rebuild the design matrix with exactly the same column order as mvRegress
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)
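For reference, the same fit can be written with a single add_constant call on the transposed predictor matrix; a minimal sketch (mvRegressSimple and mvPredictSimple are illustrative names, and the coefficient order then follows the column order of x rather than the reversed order produced by mvRegress above):

def mvRegressSimple(y, x):
    # x has shape (n_predictors, n_obs); transpose and let statsmodels prepend the intercept
    X = sm.add_constant(np.asarray(x).T)
    return sm.OLS(y, X).fit()

def mvPredictSimple(x, res):
    X = sm.add_constant(np.asarray(x).T)
    return res.predict(X)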
In [16]:
res = mvRegress(yArrTrain, xArrTrain)
In [17]:
res.summary()
Out[17]:
In [18]:
res.params
Out[18]:
In [19]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest,res)
#yPred0
In [20]:
yArrTest
Out[20]:
In [21]:
def futurePredictMLR(res, usdi, eurousd, oil):
    # params[3] is the intercept; params[2], params[1] and params[0] multiply
    # the predictor rows x[0], x[1] and x[2] passed to mvRegress, in that order.
    return (res.params[3] + (res.params[2] * usdi) + (res.params[1] * eurousd) + (res.params[0] * oil))
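A usage sketch with placeholder inputs (the three numbers are illustrative, not real quotes; the arguments after res line up with the predictor rows x[0], x[1], x[2] used in mvRegress):

# hypothetical next-day predictor values
print(futurePredictMLR(res, 85.0, 1.27, 93.5))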
In [22]:
# Model 1.1 (simple autoregressive): predict gold from its own recent values.
# Start with three copies of the gold series; the next cell shifts them into lags of 1, 2 and 3 rows.
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])
xArrTrain = np.array([
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[0]]),
])
# Same construction for the test set
xArrTest = np.array([
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[0]]),
])
In [23]:
print(xArrTest)
print(xArrTrain)
# Shift row i of each array by i+1 positions, producing lags of 1, 2 and 3 rows;
# the tail of each row is padded with the original last value.
n = len(xArrTrain[0])
for i in range(3):
    for j in range(n):
        if j < n - i - 1:
            xArrTrain[i][j] = xArrTrain[i][j + i + 1]
        else:
            xArrTrain[i][j] = xArrTrain[i][n - 1]
n = len(xArrTest[0])
for i in range(3):
    for j in range(n):
        if j < n - i - 1:
            xArrTest[i][j] = xArrTest[i][j + i + 1]
        else:
            xArrTest[i][j] = xArrTest[i][n - 1]
print(xArrTrain)
print(xArrTest)
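The two loop blocks amount to shifting the gold series by one, two and three rows with tail padding; a compact sketch of the same construction (buildLags is a hypothetical helper, not part of the original notebook):

def buildLags(series, nLags=3):
    # row k holds the series shifted k+1 rows up, with the final value repeated as padding
    series = np.asarray(series, dtype=float)
    lags = np.empty((nLags, len(series)))
    for k in range(nLags):
        lags[k, :len(series) - k - 1] = series[k + 1:]
        lags[k, len(series) - k - 1:] = series[-1]
    return lags

# e.g. xArrTrainAlt = buildLags(dfMasterTrain[candidatesList[0]])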
In [24]:
result = mvRegress(yArrTrain, xArrTrain)
result.summary()
yPred1 = mvPredict(xArrTest,result)
# Add a fixed offset to every prediction (constant chosen by the authors)
tweak = np.empty(len(yPred1))
tweak.fill(0.036515358062)
yPred1 = np.add(yPred1, tweak)
print (result.params[3])
print (result.params[2])
print (result.params[1])
print (result.params[0])
In [25]:
from sklearn.metrics import mean_squared_error
In [26]:
# Error metrics for Model 1.0 (index predictors) and Model 1.1 (autoregressive)
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = np.sqrt(mean_squared_error(yArrTest, yPred0))
errABS_y0 = np.absolute(yPred0 - yArrTest)
errVar_y1 = 100 * (np.absolute(yPred1 - yArrTest) / yArrTest)
errRMS_y1 = np.sqrt(mean_squared_error(yArrTest, yPred1))
errABS_y1 = np.absolute(yPred1 - yArrTest)
err_y1 = yPred1 - yArrTest
print(np.mean(err_y1))
fig, axes = plt.subplots(1, 1, figsize=(12, 4))
axes.hist(err_y1, color='g', bins=120)
axes.set_ylabel('Error Frequency')
axes.set_xlabel('Error')
axes.set_title("Error Variations Model-1.1")
Out[26]:
In [27]:
dfErr = pd.DataFrame(data=None, columns=['Model', 'Minimum % Error', 'Maximum % Error', 'RMSE', 'Mean Absolute Error', 'Mean Percentage Error'])
dfErr['Model'] = ('Model 1.0', 'Model 1.1 (Simple Autoregressive)')
dfErr['Minimum % Error'] = (min(errVar_y0), min(errVar_y1))
dfErr['Maximum % Error'] = (max(errVar_y0), max(errVar_y1))
dfErr['RMSE'] = (errRMS_y0, errRMS_y1)
dfErr['Mean Absolute Error'] = (np.mean(errABS_y0), np.mean(errABS_y1))
dfErr['Mean Percentage Error'] = (np.mean(errVar_y0), np.mean(errVar_y1))
In [28]:
dfErr
Out[28]: