In [190]:
# Notebook banner: prints the title of this analysis.
print("Linear Regression Model")
In [191]:
#Needed Libraries
%matplotlib inline
import numpy as np
import scipy as stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from pandas import merge
In [192]:
#Input Variables
# Remote CSV locations of the four daily series. The file names suggest the
# data spans 1994-10-03 through 2014-09-30 (not verified here).
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
# Fraction of the merged rows that is assigned to the training set.
trainingRatio = 0.6
# Number of shuffle runs passed to shuffleAndCorr.
pRuns = 10
# Threshold passed to shuffleAndCorr: a shuffle counts as a success when its
# absolute correlation differs from the original by less than this value.
pThreshold = 0.4
In [193]:
#Source Data Loading
# Read the four source CSV files, one DataFrame per series, in the same order
# as the path variables are listed.
dfOil, dfSP500, dfNyse, dfUsInd = [
    pd.read_csv(csvPath)
    for csvPath in (oilDataPath, sp500DataPath, nyseDataPath, usdIndexDataPath)
]
In [194]:
# Preview the first rows of the table read from oilDataPath.
dfOil.head()
Out[194]:
In [195]:
# Preview the first rows of the table read from sp500DataPath.
dfSP500.head()
Out[195]:
In [196]:
# Preview the first rows of the table read from nyseDataPath.
dfNyse.head()
Out[196]:
In [197]:
# Preview the first rows of the table read from usdIndexDataPath.
dfUsInd.head()
Out[197]:
In [198]:
# Inner-join the four tables on 'Date'; only dates present in every source
# survive the chain of merges.
dfMaster = dfOil
for otherFrame in (dfSP500, dfNyse, dfUsInd):
    dfMaster = merge(dfMaster, otherFrame, on='Date', how='inner')
In [199]:
# Preview the first rows of the merged table.
dfMaster.head()
Out[199]:
In [200]:
#Correlation Heat Matrix
def computeDataTableCorr(datatable, columnNames):
    """Return the pairwise correlation matrix of selected columns.

    Parameters
    ----------
    datatable : pandas.DataFrame
        Table that holds the columns to compare.
    columnNames : sequence of str
        Labels of the columns to include in the correlation.

    Returns
    -------
    pandas.DataFrame
        Result of ``DataFrame.corr()`` restricted to `columnNames`.
    """
    # Select by the argument. The previous version read the notebook-level
    # `candidatesList` here, so `columnNames` was silently ignored and the
    # function failed whenever that global was not defined yet.
    corrCandidates = datatable[list(columnNames)]
    return corrCandidates.corr()
# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    """Draw a correlation table as a heat map with pyplot.

    cTable : table handed to ``plt.imshow``; its ``columns`` label both axes.
    title  : text placed above the plot.
    """
    tickPositions = range(len(cTable))
    tickLabels = cTable.columns
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    # Column labels on the x axis are rotated so they do not overlap.
    plt.xticks(tickPositions, tickLabels, rotation=90)
    plt.yticks(tickPositions, tickLabels)
    plt.title(title)
# Columns that are correlated with each other; the oil price is listed first.
candidatesList = [
    'Oil_Value',
    'SP500_Value',
    'NYSE_Value',
    'USD_Value',
]
corrTable = computeDataTableCorr(dfMaster, candidatesList)
displayCorrHeatGraph(
    corrTable,
    'Correlation Heat Matrix (Oil_Value,SP500_Value,NYSE_Value,USD_Value)'
)
In [201]:
# Keep only the first row of the correlation table as a one-row DataFrame.
dfCorrOil = corrTable.iloc[:1]
dfCorrOil
Out[201]:
In [202]:
#pValue Test
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):
    """Estimate how often a shuffled copy of `df` reproduces a correlation.

    On every run each column (axis=0) or each row (axis=1) of a private copy
    of `df` is randomly permuted and the correlation between 'Col1' and
    'Col2' is recomputed. A run counts as a success when the absolute value
    of that correlation differs from ``abs(orgCorr)`` by less than
    `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Col1' and 'Col2'. It is not modified.
    orgCorr : float
        Correlation measured on the unshuffled data.
    runs : int
        Number of shuffles; must be positive.
    threshold : float
        Strict upper bound on the accepted difference.
    axis : {0, 1}
        Forwarded to ``DataFrame.apply``.

    Returns
    -------
    float
        Fraction of runs that counted as a success.

    Raises
    ------
    ValueError
        If `runs` is not positive (previously a ZeroDivisionError for 0).
    """
    if runs <= 0:
        raise ValueError("runs must be a positive integer")

    def _permuted(values):
        # Build a new Series instead of shuffling in place: NumPy does not
        # guarantee np.random.shuffle on a Series, and DataFrame.apply may
        # hand over a copy, in which case nothing would be shuffled at all.
        return pd.Series(np.random.permutation(values.values), index=values.index)

    # The sign of the original correlation is irrelevant; compute this once.
    targetCorr = abs(orgCorr)
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        df_sh = df_sh.apply(_permuted, axis=axis)
        newCor = abs(df_sh['Col1'].corr(df_sh['Col2']))
        if abs(newCor - targetCorr) < threshold:
            success_count += 1
    # float() keeps this a true division on Python 2 as well.
    return float(success_count) / runs
# One-row table holding the shuffle score of the first candidate (the target)
# against each of the other candidates.
dfpValue = pd.DataFrame(data=None,columns=candidatesList)
# The target itself gets a placeholder 0 in the first slot.
pValue = [0]
for i in range(1,len(candidatesList),1):
    # dfCorrOil holds a single row, so take its only entry explicitly.
    # np.float has been removed from NumPy and calling float() on a
    # one-element Series is deprecated in pandas, hence .iloc[0] + float().
    orgCorr = float(dfCorrOil[candidatesList[i]].iloc[0])
    temp_df = pd.DataFrame({'Col1':dfMaster[candidatesList[0]],
                            'Col2':dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df,orgCorr,pRuns,pThreshold))
dfpValue.loc[0] = pValue
dfpValue
Out[202]:
In [203]:
import statsmodels.api as sm
In [204]:
#Inputs, Train and Test Data preparation
# `trainingRatio` of the rows (rounded down) form the training set, taken from
# the tail of dfMaster; every remaining row at the head is the test set.
# int() replaces np.int, which has been removed from NumPy.
trainSize = int(np.floor(len(dfMaster) * trainingRatio))
splitAt = len(dfMaster) - trainSize
dfMasterTrain = dfMaster[splitAt:len(dfMaster)]
# Slice ends are exclusive, so the test set stops at splitAt. The previous
# `splitAt - 1` left the row just before the training window out of both sets.
dfMasterTest = dfMaster[0:splitAt]
In [205]:
# Model 1.0 inputs: one row per predictor column (candidates 1..3); the
# target is candidate 0.
predictorCols = candidatesList[1:4]
xArrTrain = np.array([np.array(dfMasterTrain[col]) for col in predictorCols])
xArrTest = np.array([np.array(dfMasterTest[col]) for col in predictorCols])
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])
In [206]:
def _designMatrix(x):
    """Stack predictor rows into the column layout shared by fit and predict.

    `x` is a sequence of equally long 1-D predictor rows. The result has one
    column per predictor in reverse order followed by a column of ones, i.e.
    [x[k-1], ..., x[1], x[0], 1].

    The former code wrapped every stacking step in sm.add_constant; since a
    ones column was always stacked in by hand first, those calls returned
    their input unchanged and are dropped here.
    """
    ones = np.ones(len(x[0]))
    return np.column_stack(list(x)[::-1] + [ones])

def mvRegress(y, x):
    """Fit an ordinary least squares model of `y` on the predictor rows `x`.

    Returns the fitted statsmodels results object. With k predictors,
    params[0..k-1] belong to x[k-1]..x[0] and params[k] is the intercept.
    """
    return sm.OLS(y, _designMatrix(x)).fit()

def mvPredict(x,res):
    """Predict with a model fitted by mvRegress.

    `x` must list its predictor rows in the same order as in the fit; the
    return value is whatever ``res.predict`` yields for the design matrix.
    """
    return res.predict(_designMatrix(x))
In [207]:
# Fit the regression on the training arrays currently held in yArrTrain / xArrTrain.
res = mvRegress(yArrTrain, xArrTrain)
In [208]:
# Show the fit summary as the cell output.
res.summary()
Out[208]:
In [209]:
# Predict the test set with the fitted model `res`.
yPred0 = mvPredict(xArrTest,res)
# NOTE(review): 20.4288959475 is an unexplained constant added to every
# prediction - confirm where it comes from before relying on the results.
tw = np.full(len(yPred0), 20.4288959475)
yPred0 = yPred0 + tw
In [210]:
#yArrTest
In [211]:
def futurePredictMLR(res,sp500,nyse,usdi):
    """Evaluate the fitted linear model for one set of predictor values.

    Reads res.params[0..3]: index 3 is used as the constant term, index 2
    multiplies `sp500`, index 1 multiplies `nyse` and index 0 multiplies `usdi`.
    """
    coefUsd = res.params[0]
    coefNyse = res.params[1]
    coefSp500 = res.params[2]
    constant = res.params[3]
    prediction = constant + (coefSp500 * sp500)
    prediction = prediction + (coefNyse * nyse)
    prediction = prediction + (coefUsd * usdi)
    return prediction
In [212]:
# Model 1.1 inputs: the target and all four predictor rows are the first
# candidate column, so the four rows start out identical.
targetCol = candidatesList[0]
yArrTrain = np.array(dfMasterTrain[targetCol])
yArrTest = np.array(dfMasterTest[targetCol])
xArrTrain = np.array([np.array(dfMasterTrain[targetCol]) for copyIdx in range(4)])
#train is essentially the same as test!!
xArrTest = np.array([np.array(dfMasterTest[targetCol]) for copyIdx in range(4)])
In [213]:
print (xArrTest)
print (xArrTrain)

def _padShiftRows(lagRows, rowCount):
    """Shift row i of `lagRows` left by i+1 positions, in place.

    The positions vacated at the end of each row are filled with the value
    the row had in its last position. `lagRows` is expected to be a 2-D
    NumPy array; only the first `rowCount` rows are touched.
    """
    width = len(lagRows[0])
    if width == 0:
        return
    for i in range(rowCount):
        row = lagRows[i]
        step = i + 1
        keep = max(width - step, 0)
        lastValue = row[width - 1]
        # Copy the source slice first so the overlapping assignment is safe.
        row[:keep] = row[step:step + keep].copy()
        row[keep:] = lastValue

# NOTE: the shift mutates the arrays, so running this cell twice shifts twice.
n=len(xArrTrain[0])
_padShiftRows(xArrTrain, 4)
n=len(xArrTest[0])
_padShiftRows(xArrTest, 4)
print (xArrTrain)
print (xArrTest)
In [214]:
# Fit on the current training arrays and predict the test set.
result = mvRegress(yArrTrain, xArrTrain)
# A bare summary() call in the middle of a cell displays nothing; print it so
# the fit statistics actually show up.
print (result.summary())
yPred1 = mvPredict(xArrTest,result)
# NOTE(review): 0.0385522746912 is an unexplained constant added to every
# prediction - confirm where it comes from.
tweak1 = np.empty(len(yPred1))
tweak1.fill(0.0385522746912)
yPred1 = np.add(yPred1,tweak1)
# Print every coefficient, last one first. The previous four prints started
# at params[3], which skips the final entry whenever the fit has more than
# four coefficients.
for idx in range(len(result.params) - 1, -1, -1):
    print (result.params[idx])
In [215]:
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
from __future__ import print_function
# Autocorrelation analysis of the oil price column taken from the unmerged oil table.
dfOil_value = dfOil['Oil_Value']
# Line plot of the series.
dfOil_value.plot(figsize=(12,4))
# Autocorrelation values up to lag 40 (computed but only referenced by the
# commented-out print below).
# NOTE(review): the `unbiased` keyword and the 'ywunbiased' method name were
# renamed in later statsmodels releases - confirm against the installed version.
acf_values = sm.tsa.stattools.acf(dfOil_value, unbiased=False, nlags=40, confint=None, qstat=False, fft=False, alpha=None)
#print (acf_values)
# Partial autocorrelation values up to lag 40, printed below.
pacf_values = sm.tsa.stattools.pacf(dfOil_value, nlags=40, method='ywunbiased', alpha=None)
print (pacf_values)
# Two stacked panels: ACF on top (211), PACF below (212), both for lags 0..10.
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dfOil_value.values.squeeze(), lags=10, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dfOil_value, lags=10, ax=ax2)
#statsmodels.tsa.stattools.acf
#arma_mod_20 = sm.tsa.ARMA(dfOil_value, (2,0)).fit()
In [216]:
# Model 1.2 inputs: the target is candidate 0; the predictors are twelve rows
# made of three consecutive copies of each of the four candidate columns, in
# candidatesList order (rows 0-2, 3-5, 6-8, 9-11).
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])
lagSourceCols = [col for col in candidatesList[0:4] for copyIdx in range(3)]
xArrTrain = np.array([np.array(dfMasterTrain[col]) for col in lagSourceCols])
#train is essentially the same as test!!
xArrTest = np.array([np.array(dfMasterTest[col]) for col in lagSourceCols])
n=len(xArrTrain[0])
def array_shift(xArrTrain,n):
    """Shift the twelve rows of `xArrTrain` left, in place.

    Rows are handled in four groups of three (0-2, 3-5, 6-8, 9-11). Inside
    each group the first row moves left by 1 position, the second by 2 and
    the third by 3. Positions whose source would fall past index n-1 receive
    the value stored at index n-1. Only the first `n` entries of a row are
    written. Returns None.
    """
    if n <= 0:
        return
    for lag in (1, 2, 3):
        # The rows sharing this lag: one from each group of three.
        groupRows = [xArrTrain[lag - 1 + base] for base in (0, 3, 6, 9)]
        firstPadded = n - lag
        for pos in range(n):
            src = pos + lag if pos < firstPadded else n - 1
            for row in groupRows:
                row[pos] = row[src]
# Turn the copied rows into shifted versions (in place; re-running shifts again).
array_shift(xArrTrain,len(xArrTrain[0]))
array_shift(xArrTest, len(xArrTest[0]))
print (len(xArrTest[0]))
print (len(xArrTrain[0]))
print (xArrTrain)
print (xArrTest)
result = mvRegress(yArrTrain, xArrTrain)
# A bare summary() call in the middle of a cell displays nothing; print it.
print (result.summary())
yPred2 = mvPredict(xArrTest,result)
# Size the offset vector from yPred2 itself. It was sized from yPred1, which
# only worked while both predictions happened to have the same length and
# made this cell depend on an earlier one having been run.
# NOTE(review): 0.0892416479015 is an unexplained constant offset - confirm.
tweak2 = np.empty(len(yPred2))
tweak2.fill(0.0892416479015)
yPred2 = np.add(yPred2,tweak2)
print (result.params)
In [217]:
from sklearn.metrics import mean_squared_error
In [218]:
def _errorMetrics(predicted, actual):
    """Return (percent abs error, RMSE, abs error, signed error) for one model."""
    percentAbs = 100 * (np.absolute(predicted - actual) / actual)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    absolute = np.absolute(predicted - actual)
    signed = predicted - actual
    return percentAbs, rmse, absolute, signed

errVar_y0, errRMS_y0, errABS_y0, err_y0 = _errorMetrics(yPred0, yArrTest)
errVar_y1, errRMS_y1, errABS_y1, err_y1 = _errorMetrics(yPred1, yArrTest)
errVar_y2, errRMS_y2, errABS_y2, err_y2 = _errorMetrics(yPred2, yArrTest)

# Mean and standard deviation of the signed error, one model after another.
for meanLabel, stdLabel, signedErr in (
        ("Mean Model 1.0 :=", "Standard Deviation Model 1.0 :=", err_y0),
        ("Mean Model 1.1 :=", "Standard Deviation Model 1.1 :=", err_y1),
        ("Mean Model 1.2 :=", "Standard Deviation Model 1.2 :=", err_y2)):
    print (meanLabel)
    print (np.mean(signedErr))
    print (stdLabel)
    print (np.std(signedErr))
def plot(err,model_number,c):
    """Draw a 120-bin histogram of `err` on a new 12x4 figure.

    err          : values to bin.
    model_number : text appended to the plot title.
    c            : colour passed to the histogram.
    """
    _, histAxes = plt.subplots(figsize=(12, 4))
    histAxes.hist(err, bins=120, color=c)
    histAxes.set_xlabel('Error')
    histAxes.set_ylabel('Error Frequency')
    histAxes.set_title("Error Variations Model-"+model_number)
# Signed error expressed as a percentage of the actual value.
errNorm_y0 = 100*((yPred0 - yArrTest)/yArrTest)
errNorm_y1 = 100*((yPred1 - yArrTest)/yArrTest)
errNorm_y2 = 100*((yPred2 - yArrTest)/yArrTest)

# One histogram per (series, title suffix, colour), drawn in this order.
for errSeries, titleSuffix, colour in (
        (err_y1, "1.1", 'g'),
        (err_y0, "1.0", 'b'),
        (errNorm_y0, "1.0 - normalized error", 'r'),
        (errNorm_y1, "1.1 - normalized error", 'c'),
        (err_y2, "1.2", 'g'),
        (errNorm_y2, "1.2 - normalized error", 'r')):
    plot (errSeries, titleSuffix, colour)
In [219]:
# Summary table with one row per model.
dfErr = pd.DataFrame(data=None, columns=['Model','Minimum % Error','Maximum % Error', 'RMSE Error', 'Mean Absolute Error','Mean Percentage Error'])
dfErr['Model'] = ('Model 1.0','Model 1.1 (Simple Autoregressive)','Model 1.2')
dfErr['Minimum % Error'] = (min(errVar_y0),min(errVar_y1),min(errVar_y2))
dfErr['Maximum % Error'] = (max(errVar_y0),max(errVar_y1),max(errVar_y2))
dfErr['RMSE Error'] = (errRMS_y0,errRMS_y1,errRMS_y2)
dfErr['Mean Absolute Error'] = (np.mean(errABS_y0),np.mean(errABS_y1),np.mean(errABS_y2))
# All three entries use the percentage errors (errVar_*). The Model 1.2 entry
# used errABS_y2, which put a mean absolute error into the percentage column.
dfErr['Mean Percentage Error'] = (np.mean(errVar_y0),np.mean(errVar_y1),np.mean(errVar_y2))
In [220]:
# Display the error summary table as the cell output.
dfErr
Out[220]:
In [220]: