In [1]:
print("Linear Regression Model")
In [2]:
#Needed Libraries
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from pandas import merge
In [3]:
#Input Variables
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6
pRuns = 10
pThreshold = 0.4
In [4]:
def normalize(df, colName):
    dfNorm = df.copy(deep=True)
    dfNorm[colName] = abs(df[colName] - df[colName].mean()) * 100 / (df[colName].max() - df[colName].min())
    return dfNorm
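A quick sanity check of normalize on a toy frame (the values below are illustrative, not from the project data):
dfToy = pd.DataFrame({'Toy_Value': [10.0, 20.0, 30.0]})
normalize(dfToy, 'Toy_Value')['Toy_Value']  # |x - 20| * 100 / 20 -> [50.0, 0.0, 50.0]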
In [5]:
#Source Data Loading
dfOil = pd.read_csv(oilDataPath)[:3650]
dfSP500 = pd.read_csv(sp500DataPath)[:3650]
dfNyse = pd.read_csv(nyseDataPath)[:3650]
dfUsInd = pd.read_csv(usdIndexDataPath)[:3650]
In [6]:
dfOil.tail()
Out[6]:
In [7]:
print(len(dfOil['Oil_Value']))

def shiftOneBehind(dfOriginal, colNameSrc, colNameDst):
    # Shift the series one step behind: row i takes the value from row i+1,
    # and the last row keeps its original value.
    dfOneBehind = dfOriginal.copy()
    dfOneBehind['Date'] = dfOriginal['Date']
    for i in np.arange(0, len(dfOriginal['Date']), 1):
        if i < len(dfOriginal['Date']) - 1:
            dfOneBehind.loc[i, colNameDst] = dfOriginal[colNameSrc][i + 1]
        else:
            dfOneBehind.loc[i, colNameDst] = dfOriginal[colNameSrc][i]
    return dfOneBehind

dfOilOneBehind = shiftOneBehind(dfOil, 'Oil_Value', 'Oil_Value')
dfOilOneBehind.head()
Out[7]:
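The same one-step lag can also be expressed with pandas' built-in shift; a minimal sketch assuming the same column names (dfOilShifted is just an illustrative name):
dfOilShifted = dfOil.copy()
dfOilShifted['Oil_Value'] = dfOil['Oil_Value'].shift(-1).fillna(dfOil['Oil_Value'].iloc[-1])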
In [8]:
#dfSP500.head()
dfSP500OneBehind = shiftOneBehind(dfSP500,'SP500_Value','SP500_Value')
dfSP500OneBehind.tail()
Out[8]:
In [9]:
dfNyseOneBehind = shiftOneBehind(dfNyse,'NYSE_Value','NYSE_Value')
dfNyseOneBehind.tail()
Out[9]:
In [10]:
dfUsIndOneBehind = shiftOneBehind(dfUsInd,'USD_Value','USD_Value')
dfUsIndOneBehind.tail()
Out[10]:
In [11]:
#dfMaster = merge(dfOil,dfOilOneBehind,on='Date',how='inner')
dfMaster = merge(dfOilOneBehind,dfSP500OneBehind,on='Date',how='inner')
dfMaster = merge(dfMaster,dfNyseOneBehind,on='Date',how='inner')
dfMaster = merge(dfMaster,dfUsIndOneBehind,on='Date',how='inner')
#dfMaster = merge(dfMaster,dfSP500,on='Date',how='inner')
#dfMaster = merge(dfMaster,dfNyse,on='Date',how='inner')
#dfMaster = merge(dfMaster,dfUsInd,on='Date',how='inner')
In [12]:
dfMaster.tail()
Out[12]:
In [13]:
# Correlation heat matrix
def computeDataTableCorr(datatable, columnNames):
    corrCandidates = datatable[columnNames]
    return corrCandidates.corr()

# Plotting the correlation heat map
def displayCorrHeatGraph(cTable, title):
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)

candidatesList = ['Oil_Value', 'SP500_Value', 'NYSE_Value', 'USD_Value']
corrTable = computeDataTableCorr(dfMaster, candidatesList)
print(corrTable)
displayCorrHeatGraph(corrTable, 'Correlation Heat Matrix (Oil_Value, SP500_Value, NYSE_Value, USD_Value)')
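For reference, corr() above computes pairwise Pearson correlations; a single pair can be checked directly:
dfMaster['Oil_Value'].corr(dfMaster['SP500_Value'])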
In [14]:
dfCorrOil = corrTable[:1]
dfCorrOil
Out[14]:
In [15]:
# p-value test via shuffling
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        df_sh.apply(np.random.shuffle, axis=axis)
        newCor = df_sh['Col1'].corr(df_sh['Col2'])
        diff = abs(abs(newCor) - abs(orgCorr))
        if diff < threshold:
            success_count += 1
    return success_count / float(runs)

dfpValue = pd.DataFrame(data=None, columns=candidatesList)
pValue = [0]  # placeholder for Oil_Value against itself
for i in range(1, len(candidatesList)):
    orgCorr = dfCorrOil[candidatesList[i]]
    temp_df = pd.DataFrame({'Col1': dfMaster[candidatesList[0]],
                            'Col2': dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df, float(orgCorr), pRuns, pThreshold))
dfpValue.loc[0] = pValue
dfpValue
dfpValue
Out[15]:
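The test above counts shuffled correlations that stay within pThreshold of the original. A more conventional permutation p-value counts shuffles whose |correlation| is at least as large as the observed one; a sketch of that alternative (shufflePValue is a hypothetical helper, not part of the original notebook):
def shufflePValue(x, y, runs=1000):
    # Fraction of random permutations whose |corr| is >= the observed |corr|.
    obs = abs(x.corr(y))
    hits = 0
    for _ in range(runs):
        hits += abs(x.corr(pd.Series(np.random.permutation(y.values), index=y.index))) >= obs
    return hits / float(runs)
# e.g. shufflePValue(dfMaster['Oil_Value'], dfMaster['SP500_Value'])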
In [16]:
import statsmodels.api as sm
In [17]:
#Normalize
dfSP500Norm = normalize(dfSP500OneBehind, 'SP500_Value')
dfNyseNorm = normalize(dfNyseOneBehind, 'NYSE_Value')
dfUsIndNorm = normalize(dfUsIndOneBehind, 'USD_Value')
dfOilNorm = normalize(dfOilOneBehind,'Oil_Value')
dfMasterNorm = merge(dfOilNorm,dfSP500Norm,on='Date',how='inner')
dfMasterNorm = merge(dfMasterNorm,dfNyseNorm,on='Date',how='inner')
dfMasterNorm = merge(dfMasterNorm,dfUsIndNorm,on='Date',how='inner')
dfMasterNorm.head()
Out[17]:
In [18]:
# Inputs: train and test data preparation
trainSize = int(np.floor(len(dfMaster['Date']) * trainingRatio))  # 60:40 split (trainingRatio = 0.6)
dfMasterTrain = dfMaster[len(dfMaster) - trainSize:len(dfMaster)]
dfMasterTest = dfMaster[0:(len(dfMaster) - trainSize) - 1]
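A quick way to see which date ranges land in each split (a sketch; it only prints the first and last dates of each frame):
print(dfMasterTrain['Date'].iloc[0], '...', dfMasterTrain['Date'].iloc[-1])
print(dfMasterTest['Date'].iloc[0], '...', dfMasterTest['Date'].iloc[-1])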
In [19]:
xArrTrain = [
    #np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[1]]),
    np.array(dfMasterTrain[candidatesList[2]]),
    np.array(dfMasterTrain[candidatesList[3]]),
]
xArrTrain = np.array(xArrTrain)
xArrTest = [
    #np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[1]]),
    np.array(dfMasterTest[candidatesList[2]]),
    np.array(dfMasterTest[candidatesList[3]]),
]
xArrTest = np.array(xArrTest)
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])
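A quick shape check: xArrTrain and xArrTest are (3, n) arrays with one predictor per row, which is the layout mvRegress in the next cell expects (it iterates over x[0], x[1], x[2]).
print(xArrTrain.shape, yArrTrain.shape)
print(xArrTest.shape, yArrTest.shape)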
In [20]:
def mvRegress(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x, res):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)
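Since sm.add_constant skips adding a constant when a constant column is already present, the loop in mvRegress ends up with a design matrix whose columns are [x[2], x[1], x[0], 1]. An equivalent one-step construction (a sketch reusing mvRegress's argument names, not part of the original notebook):
X = np.column_stack((x[2], x[1], x[0], np.ones(len(x[0]))))
results = sm.OLS(y, X).fit()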
In [21]:
res = mvRegress(yArrTrain, xArrTrain)
In [22]:
res.summary()
Out[22]:
In [23]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest,res)
In [24]:
#yArrTest
In [25]:
def futurePredictMLR(res, sp500, nyse, usdi):
    return res.params[3] + (res.params[2] * sp500) + (res.params[1] * nyse) + (res.params[0] * usdi)
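futurePredictMLR can then be called with current index levels to project the next oil price; a hypothetical call (the values are illustrative placeholders, not real quotes):
futurePredictMLR(res, sp500=1980.0, nyse=10900.0, usdi=85.0)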
In [26]:
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

# All three predictor rows start out as the same Oil_Value series; the next cell
# turns them into lags of one, two and three steps for the autoregressive model.
xArrTrain = [
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[0]]),
    np.array(dfMasterTrain[candidatesList[0]]),
]
xArrTrain = np.array(xArrTrain)
xArrTest = [
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[0]]),
    np.array(dfMasterTest[candidatesList[0]]),
]
xArrTest = np.array(xArrTest)
In [27]:
print(xArrTest)
print(xArrTrain)
# Turn predictor row i into the series shifted forward by i+1 steps (a lag of i+1),
# padding the tail with the last observed value.
n = len(xArrTrain[0])
for i in range(0, 3, 1):
    for j in range(0, n, 1):
        if j < n - i - 1:
            xArrTrain[i][j] = xArrTrain[i][j + i + 1]
        else:
            xArrTrain[i][j] = xArrTrain[i][n - 1]
n = len(xArrTest[0])
for i in range(0, 3, 1):
    for j in range(0, n, 1):
        if j < n - i - 1:
            xArrTest[i][j] = xArrTest[i][j + i + 1]
        else:
            xArrTest[i][j] = xArrTest[i][n - 1]
print(xArrTrain)
print(xArrTest)
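The nested loops above replace element j of row i with element j+i+1 and repeat the last value at the tail. A vectorized equivalent (lagForward is a hypothetical helper, not from the original notebook):
def lagForward(arr, k):
    # Shift a 1-D array forward by k steps; repeat the last value at the tail.
    out = np.empty_like(arr)
    out[:-k] = arr[k:]
    out[-k:] = arr[-1]
    return out
# e.g. xArrTrain[i] = lagForward(np.array(dfMasterTrain['Oil_Value']), i + 1)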
In [28]:
result = mvRegress(yArrTrain, xArrTrain)
result.summary()
yPred1 = mvPredict(xArrTest,result)
tweak = np.empty(len(yPred1))
tweak.fill(0.0402791209432)
yPred1 = np.add(yPred1,tweak)
print (result.params[3])
print (result.params[2])
print (result.params[1])
print (result.params[0])
In [29]:
from sklearn.metrics import mean_squared_error
In [40]:
errCorrection = 18.709640
yPred0 = yPred0 + errCorrection
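errCorrection above appears to act as a constant bias correction for the multivariate model's predictions. Under that assumption (the derivation is not shown in the original notebook), such a constant could be estimated from the mean training residual; shown commented so it does not override the value used above:
# errCorrection = np.mean(yArrTrain - mvPredict(xTrainMV, res)),
# where xTrainMV would be the multivariate training matrix built in In [19]
# (xArrTrain itself has since been overwritten with the autoregressive lags).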
In [41]:
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = np.sqrt(mean_squared_error(yArrTest, yPred0))
errABS_y0 = np.absolute(yPred0 - yArrTest)
errVar_y1 = 100 * (np.absolute(yPred1 - yArrTest) / yArrTest)
errRMS_y1 = np.sqrt(mean_squared_error(yArrTest, yPred1))
errABS_y1 = np.absolute(yPred1 - yArrTest)
err_y1 = yPred1 - yArrTest
print(np.mean(err_y1))
fig, axes = plt.subplots(1, 1, figsize=(12, 4))
axes.hist(err_y1, color='g', bins=120)
axes.set_ylabel('Error Frequency')
axes.set_xlabel('Error')
axes.set_title("Error Variations Model 1.1 (Autoregressive)")
Out[41]:
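The mean absolute error computed in the next cell can also be cross-checked with sklearn (a sketch; mean_absolute_error lives in sklearn.metrics alongside mean_squared_error):
from sklearn.metrics import mean_absolute_error
mean_absolute_error(yArrTest, yPred1)  # should agree with np.mean(errABS_y1)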
In [42]:
dfErr = pd.DataFrame(data=None, columns=['Model', 'Minimum % Error', 'Maximum % Error', 'RMSE', 'Mean Absolute Error', 'Mean Absolute % Error'])
dfErr['Model'] = ('Model 1.0 (MV Linear Regression)', 'Model 1.1 (Simple Autoregressive)')
dfErr['Minimum % Error'] = (min(errVar_y0), min(errVar_y1))
dfErr['Maximum % Error'] = (max(errVar_y0), max(errVar_y1))
dfErr['RMSE'] = (errRMS_y0, errRMS_y1)
dfErr['Mean Absolute Error'] = (np.mean(errABS_y0), np.mean(errABS_y1))
dfErr['Mean Absolute % Error'] = (np.mean(errVar_y0), np.mean(errVar_y1))
In [43]:
dfErr
Out[43]:
In [44]:
# Plot error histogram
def plotErrHist(plotY, xAxesTitle, yAxesTitle, plotTitle, barColor, nBins):
    fig, axes = plt.subplots(1, 1, figsize=(12, 4))
    axes.hist(plotY, color=barColor, bins=nBins)
    axes.set_ylabel(yAxesTitle)
    axes.set_xlabel(xAxesTitle)
    axes.set_title(plotTitle)
In [35]:
plotErrHist((yPred1 - yArrTest),'Error','Error Frequency','Auto Regression','g',120)
In [45]:
plotErrHist((yPred0 - yArrTest),'Error','Error Frequency','MV Linear Regression','b',120)
In [37]:
#November
#mvPredict(xArrTest,res)
dfMasterTest.head()
Out[37]:
In [38]:
yPred0[0]
Out[38]:
In [39]:
yArrTest[0]
Out[39]: