In [8]:
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from pandas import merge
In [9]:
# Tunable parameters. trainingRatio is the fraction of rows assigned to the
# training split; pRuns / pThreshold are not used in the cells shown here
# (presumably consumed further down the notebook -- TODO confirm).
trainingRatio, pRuns, pThreshold = 0.6, 10, 0.4

# Remote CSV locations of the daily series (file names cover 1994-10-03 to 2014-09-30).
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/GOLD_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
In [10]:
# Read every source series into its own DataFrame, in the same order as before:
# oil, S&P 500, NYSE, USD index, gold.
dfOil, dfSP500, dfNyse, dfUsInd, dfGold = [
    pd.read_csv(csvPath)
    for csvPath in (
        oilDataPath,
        sp500DataPath,
        nyseDataPath,
        usdIndexDataPath,
        goldDataPath,
    )
]
In [11]:
# Show the per-column maximum of the oil frame as the cell output.
dfOil.max()
Out[11]:
In [12]:
# Show the per-column minimum of the oil frame as the cell output.
dfOil.min()
Out[12]:
In [16]:
# Preview the NYSE series before rescaling.
print(dfNyse.head())

# Mean-centre NYSE_Value and express it as a percentage of its observed
# (max - min) range; the column is overwritten in place.
nyseValues = dfNyse['NYSE_Value']
nyseSpread = nyseValues.max() - nyseValues.min()
dfNyse['NYSE_Value'] = (nyseValues - nyseValues.mean()) * 100 / nyseSpread

# Preview again to confirm the rescaled values.
print(dfNyse.head())
In [7]:
# Preview the gold series before rescaling.
print(dfGold.head())

# Mean-centre Gold_Value and express it as a percentage of its observed
# (max - min) range; the column is overwritten in place.
goldValues = dfGold['Gold_Value']
goldSpread = goldValues.max() - goldValues.min()
dfGold['Gold_Value'] = (goldValues - goldValues.mean()) * 100 / goldSpread

# Preview again to confirm the rescaled values.
print(dfGold.head())
In [7]:
In [8]:
# Inner-join every series onto the oil frame by 'Date', one frame at a time,
# so only dates present in all five sources survive.
dfMaster = dfOil
for dfSeries in (dfSP500, dfNyse, dfUsInd, dfGold):
    dfMaster = merge(dfMaster, dfSeries, on='Date', how='inner')
In [9]:
# Show the first rows of the merged frame as the cell output.
dfMaster.head()
Out[9]:
In [10]:
# Correlation heat matrix
def computeDataTableCorr(datatable, columnNames):
    """Return the pairwise correlation matrix of selected columns.

    Parameters
    ----------
    datatable : pandas.DataFrame
        Frame holding the series to compare.
    columnNames : list of str
        Names of the columns of ``datatable`` to include in the matrix.

    Returns
    -------
    pandas.DataFrame
        Square frame produced by ``DataFrame.corr()`` whose rows and columns
        are ``columnNames``.

    Raises
    ------
    KeyError
        If any entry of ``columnNames`` is not a column of ``datatable``.
    """
    # Select via the argument; the earlier version ignored ``columnNames`` and
    # silently read a notebook-level list instead, so any other column subset
    # passed by a caller had no effect.
    corrCandidates = datatable[columnNames]
    return corrCandidates.corr()
# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    """Render a correlation table as a heat map using the pyplot interface.

    ``cTable`` is drawn with the 'PRGn' colour map and no interpolation, a
    colour bar is attached, both axes are labelled with ``cTable.columns``
    (x labels rotated 90 degrees), and ``title`` is set as the plot title.
    Nothing is returned.
    """
    axisLabels = cTable.columns
    tickSlots = range(len(cTable))

    plt.imshow(cTable, cmap='PRGn', interpolation='none')
    plt.colorbar()
    plt.xticks(tickSlots, axisLabels, rotation=90)
    plt.yticks(tickSlots, axisLabels)
    plt.title(title)
# Columns taking part in the analysis; later cells index into this list by position.
candidatesList = [
    'Oil_Value',
    'SP500_Value',
    'NYSE_Value',
    'USD_Value',
    'Gold_Value',
]

# Correlate the candidate columns of the merged frame and plot the result.
corrTable = computeDataTableCorr(dfMaster, candidatesList)
heatGraphTitle = 'Correlation Heat Matrix (Oil_Value,SP500_Value,NYSE_Value,USD_Value,Gold_Value)'
displayCorrHeatGraph(corrTable, heatGraphTitle)
In [11]:
# Keep only the first row of the correlation table (positional slice) and
# show it as the cell output.
dfCorrOil = corrTable.iloc[:1]
dfCorrOil
Out[11]:
In [12]:
import statsmodels.api as sm
In [13]:
# 60:40 split by row position: the last ``trainSize`` rows are used for
# training and the remaining leading rows for testing.
#
# Two fixes relative to the earlier version of this cell:
#   * ``np.int`` no longer exists in current NumPy releases, so the built-in
#     ``int`` is used to turn the floored size into a slice bound.
#   * The test slice used to be the first ``trainSize`` rows, which overlapped
#     the training slice whenever trainingRatio > 0.5 and leaked training rows
#     into the evaluation. The test set is now the exact complement.
trainSize = int(np.floor(len(dfMaster) * trainingRatio))  # 60:40 ratio
testSize = len(dfMaster) - trainSize
dfMasterTrain = dfMaster[testSize:len(dfMaster)]
dfMasterTest = dfMaster[0:testSize]
In [23]:
# Predictors are candidatesList[2] and candidatesList[4]; the response is
# candidatesList[0]. The entries at positions 1 and 3 were left disabled in
# the earlier version of this cell and remain excluded here.
featureColumns = [candidatesList[2], candidatesList[4]]
targetColumn = candidatesList[0]

# One row per predictor, one column per observation.
xArrTrain = np.array([np.array(dfMasterTrain[col]) for col in featureColumns])
xArrTest = np.array([np.array(dfMasterTest[col]) for col in featureColumns])

yArrTrain = np.array(dfMasterTrain[targetColumn])
yArrTest = np.array(dfMasterTest[targetColumn])
In [25]:
def _buildDesignMatrix(x):
    """Stack predictor series into the design matrix shared by fit and predict.

    ``x`` is a sequence of equally long 1-D series, one per predictor. The
    result has one row per observation and its columns are the predictors in
    reverse order followed by a trailing column of ones (the intercept), i.e.
    ``[x[-1], ..., x[1], x[0], 1]``. This is the same layout the fit and
    predict functions previously built independently of each other.
    """
    intercept = np.ones(len(x[0]))
    columns = list(x)[::-1]
    columns.append(intercept)
    return np.column_stack(columns)


def mvRegress(y, x):
    """Fit an ordinary-least-squares model of ``y`` on the predictors in ``x``.

    Parameters
    ----------
    y : array-like
        Response values, one per observation.
    x : sequence of array-like
        One series per predictor, each the same length as ``y``.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(y, X).fit()``. Coefficients
    follow the design-matrix column order: predictors reversed, intercept last.
    """
    # The explicit ones column already provides the intercept, so the extra
    # sm.add_constant calls (which leave such a matrix unchanged) were dropped.
    X = _buildDesignMatrix(x)
    results = sm.OLS(y, X).fit()
    return results


def mvPredict(x, res):
    """Predict with a fitted model ``res`` for the predictors in ``x``.

    ``x`` must list the predictors in the same order that was used for
    ``mvRegress`` so the design-matrix columns line up with the coefficients.
    Returns whatever ``res.predict`` returns for that design matrix.
    """
    X = _buildDesignMatrix(x)
    return res.predict(X)
In [26]:
# Fit the OLS model on the training arrays (response first, predictors second).
res = mvRegress(yArrTrain, xArrTrain)
In [27]:
# Show the fitted model's summary as the cell output.
res.summary()
Out[27]:
In [28]:
# Predict the response for the test predictors with the fitted model,
# then show the predictions as the cell output.
yPred0 = mvPredict(xArrTest,res)
yPred0
Out[28]:
In [29]:
from sklearn.metrics import mean_squared_error
from math import sqrt
In [30]:
# Error measures for the yPred0 predictions against the test response.
residual_y0 = yPred0 - yArrTest

# Absolute error per observation.
errABS_y0 = np.absolute(residual_y0)

# Absolute error as a percentage of the actual value, per observation.
errVar_y0 = 100 * (errABS_y0 / yArrTest)

# Root-mean-square error over the whole test set.
errRMS_y0 = sqrt(mean_squared_error(yArrTest, yPred0))
In [31]:
# Error summary table: the first row holds the metrics computed from the
# *_y0 error arrays, the second row ('Model 1.1 (Polynomial Fit)') is filled
# with zeros as a placeholder.
errSummary = {
    'Model': ('Model 1.0', 'Model 1.1 (Polynomial Fit)'),
    'Minimum % Error': (min(errVar_y0), 0),
    'Maximum % Error': (max(errVar_y0), 0),
    'RMSE Error': (errRMS_y0, 0),
    'Mean Absolute Error': (np.mean(errABS_y0), 0),
    'Mean Percentage Error': (np.mean(errVar_y0), 0),
}
dfErr = pd.DataFrame(data=errSummary, columns=list(errSummary))
In [32]:
# Show the error summary table as the cell output.
dfErr
Out[32]:
In [22]: