In [1]:
print("Linear Regression Model")
In [2]:
#Needed Libraries
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from pandas import merge
In [3]:
#Input Variables
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6
pRuns = 10
pThreshold = 0.4
In [4]:
#Source Data Loading
dfOil = pd.read_csv(oilDataPath)
dfSP500 = pd.read_csv(sp500DataPath)
dfNyse = pd.read_csv(nyseDataPath)
dfUsInd = pd.read_csv(usdIndexDataPath)
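In [ ]:
# Optional sanity check (a sketch, not part of the original pipeline): confirm each file
# loaded and see which columns it carries before merging. Column names other than 'Date'
# are whatever the CSVs provide (Oil_Value, SP500_Value, NYSE_Value, USD_Value below).
for name, df in [('Oil', dfOil), ('SP500', dfSP500), ('NYSE', dfNyse), ('USD Index', dfUsInd)]:
    print(name, df.shape, list(df.columns))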
In [5]:
dfOil.head()
Out[5]:
In [6]:
dfSP500.head()
Out[6]:
In [7]:
dfNyse.head()
Out[7]:
In [8]:
dfUsInd.head()
Out[8]:
In [9]:
dfMaster = merge(dfOil,dfSP500,on='Date',how='inner')
dfMaster = merge(dfMaster,dfNyse,on='Date',how='inner')
dfMaster = merge(dfMaster,dfUsInd,on='Date',how='inner')
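In [ ]:
# Optional check (sketch): the chained inner joins keep only dates present in all four
# sources, so dfMaster can be no longer than the shortest input frame; each date is
# assumed to appear once per source.
print(len(dfOil), len(dfSP500), len(dfNyse), len(dfUsInd), '->', len(dfMaster))
assert dfMaster['Date'].is_unique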
In [10]:
dfMaster.head()
Out[10]:
In [11]:
#Correlation Heat Matrix
def computeDataTableCorr(datatable, columnNames):
    # Select the candidate columns and return their pairwise correlation matrix
    corrCandidates = datatable[columnNames]
    return corrCandidates.corr()

# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    #_ = pd.scatter_matrix(corrTable, diagonal='kde', figsize=(10, 10))
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)

candidatesList = ['Oil_Value', 'SP500_Value', 'NYSE_Value', 'USD_Value']
corrTable = computeDataTableCorr(dfMaster, candidatesList)
displayCorrHeatGraph(corrTable, 'Correlation Heat Matrix (Oil_Value, SP500_Value, NYSE_Value, USD_Value)')
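In [ ]:
# Complementary view (sketch): print the rounded correlation matrix alongside the heat map,
# and optionally draw the pairwise scatter matrix hinted at in the commented-out line above
# (pd.plotting.scatter_matrix in newer pandas, pd.scatter_matrix in older releases).
print(corrTable.round(3))
#_ = pd.plotting.scatter_matrix(dfMaster[candidatesList], diagonal='kde', figsize=(10, 10))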
In [12]:
#First row of the correlation table: Oil_Value against each candidate series
dfCorrOil = corrTable[:1]
dfCorrOil
Out[12]:
In [16]:
#pValue Test (permutation test: shuffle the series and see how often the correlation survives)
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        df_sh.apply(np.random.shuffle, axis=axis)
        newCor = df_sh['Col1'].corr(df_sh['Col2'])
        if (orgCorr < 0): orgCorr = orgCorr * -1
        if (newCor < 0): newCor = newCor * -1
        #print(newCor)
        diff = abs(newCor - orgCorr)
        #print(diff)
        if (diff < threshold): success_count = success_count + 1
    p = success_count / float(runs)
    return p

dfpValue = pd.DataFrame(data=None, columns=candidatesList)
pValue = []
pValue.append(0)
for i in range(1, len(candidatesList), 1):
    orgCorr = dfCorrOil[candidatesList[i]]
    #print(orgCorr)
    temp_df = pd.DataFrame({'Col1': dfMaster[candidatesList[0]],
                            'Col2': dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df, float(orgCorr), pRuns, pThreshold))
dfpValue.loc[0] = pValue
dfpValue
Out[16]:
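In [ ]:
# For comparison (sketch only): a more conventional permutation p-value shuffles one series
# and counts how often the shuffled |correlation| is at least as large as the observed one,
# instead of the threshold-on-difference rule used by shuffleAndCorr above.
def permutationPValue(x, y, runs=1000):
    observed = abs(x.corr(y))
    hits = 0
    for _ in range(runs):
        shuffled = pd.Series(np.random.permutation(y.values), index=y.index)
        if abs(x.corr(shuffled)) >= observed:
            hits += 1
    return hits / float(runs)

# Hypothetical usage: permutationPValue(dfMaster['Oil_Value'], dfMaster['SP500_Value'], runs=200)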
In [251]:
import statsmodels.api as sm
In [252]:
#Inputs, Train and Test Data preparation (60:40 split; rows are assumed to be ordered newest-first)
trainSize = int(np.floor(len(dfMaster['Date']) * trainingRatio))
dfMasterTrain = dfMaster[len(dfMaster) - trainSize:len(dfMaster)]   # older 60% of the rows
dfMasterTest = dfMaster[0:len(dfMaster) - trainSize]                # remaining (newer) 40%
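In [ ]:
# Quick check (sketch): the two slices should not overlap and should cover every row.
print(len(dfMasterTrain), len(dfMasterTest), len(dfMaster))
assert len(dfMasterTrain) + len(dfMasterTest) == len(dfMaster)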
In [253]:
xArrTrain = [ \
#np.array(dfMasterTrain[candidatesList[0]]), \
np.array(dfMasterTrain[candidatesList[1]]), \
np.array(dfMasterTrain[candidatesList[2]]), \
np.array(dfMasterTrain[candidatesList[3]]), \
]
xArrTrain = np.array(xArrTrain)
xArrTest = [ \
#np.array(dfMasterTest[candidatesList[0]]), \
np.array(dfMasterTest[candidatesList[1]]), \
np.array(dfMasterTest[candidatesList[2]]), \
np.array(dfMasterTest[candidatesList[3]]), \
]
xArrTest = np.array(xArrTest)
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])
In [254]:
def mvRegress(y, x):
    # Build the design matrix by stacking each regressor in front of the previous columns;
    # predictors end up in reverse order, followed by the intercept column of ones.
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x, res):
    # Rebuild the design matrix in exactly the same column order used for fitting
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)
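In [ ]:
# Equivalent construction (sketch): stack all predictors once and add the intercept with a
# single sm.add_constant call. The column order differs from mvRegress, so the coefficient
# indices in res_alt.params would differ as well; shown only to clarify the design matrix.
X_train_alt = sm.add_constant(xArrTrain.T)
X_test_alt = sm.add_constant(xArrTest.T)
res_alt = sm.OLS(yArrTrain, X_train_alt).fit()
yPred_alt = res_alt.predict(X_test_alt)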
In [255]:
res = mvRegress(yArrTrain, xArrTrain)
In [256]:
res.summary()
Out[256]:
In [257]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest,res)
yPred0
Out[257]:
In [258]:
yArrTest
Out[258]:
In [259]:
def futurePredictMLR(res, sp500, nyse, usdi):
    # res.params is ordered [USD coef, NYSE coef, SP500 coef, intercept] by mvRegress
    return (res.params[3] + (res.params[2] * sp500) + (res.params[1] * nyse) + (res.params[0] * usdi))
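In [ ]:
# Hypothetical usage (the index levels below are made-up placeholders, not real quotes):
# futurePredictMLR(res, sp500=2000.0, nyse=10800.0, usdi=85.0)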
In [260]:
from sklearn.metrics import mean_squared_error
In [261]:
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = np.sqrt(mean_squared_error(yArrTest, yPred0))
errABS_y0 = np.absolute(yPred0 - yArrTest)
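In [ ]:
# For reference (sketch): errVar_y0 holds the per-observation absolute percentage error,
# errRMS_y0 the root-mean-squared error, and errABS_y0 the absolute error, i.e.
#   MPE  = mean(100 * |yPred - y| / y)
#   RMSE = sqrt(mean((yPred - y)^2))
#   MAE  = mean(|yPred - y|)
print('MPE: %.3f%%  RMSE: %.3f  MAE: %.3f' % (np.mean(errVar_y0), errRMS_y0, np.mean(errABS_y0)))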
In [262]:
# Error summary table; the Model 1.1 (Polynomial Fit) row is left as zero placeholders here.
dfErr = pd.DataFrame({
    'Model': ['Model 1.0', 'Model 1.1 (Polynomial Fit)'],
    'Minimum % Error': [min(errVar_y0), 0],
    'Maximum % Error': [max(errVar_y0), 0],
    'RMSE Error': [errRMS_y0, 0],
    'Mean Absolute Error': [np.mean(errABS_y0), 0],
    'Mean Percentage Error': [np.mean(errVar_y0), 0],
}, columns=['Model', 'Minimum % Error', 'Maximum % Error', 'RMSE Error',
            'Mean Absolute Error', 'Mean Percentage Error'])
In [263]:
dfErr
Out[263]: