In [190]:
print("Linear Regression Model")


Linear Regression Model

In [191]:
#Needed Libraries
%matplotlib inline
import numpy as np
import scipy as stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from pandas import merge

In [192]:
#Input Variables
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6
pRuns = 10 
pThreshold = 0.4

In [193]:
#Source Data Loading

dfOil = pd.read_csv(oilDataPath)
dfSP500 = pd.read_csv(sp500DataPath)
dfNyse = pd.read_csv(nyseDataPath)
dfUsInd = pd.read_csv(usdIndexDataPath)

In [194]:
dfOil.head()


Out[194]:
Date Oil_Value
0 9/30/2014 91.17
1 9/29/2014 94.53
2 9/26/2014 95.55
3 9/25/2014 93.59
4 9/24/2014 93.60

5 rows × 2 columns


In [195]:
dfSP500.head()


Out[195]:
Date SP500_Value
0 9/30/2014 1972.29
1 9/29/2014 1977.80
2 9/26/2014 1982.85
3 9/25/2014 1965.99
4 9/24/2014 1998.30

5 rows × 2 columns


In [196]:
dfNyse.head()


Out[196]:
Date NYSE_Value
0 9/30/2014 10702.93
1 9/29/2014 10749.05
2 9/26/2014 10798.88
3 9/25/2014 10722.21
4 9/24/2014 10885.60

5 rows × 2 columns


In [197]:
dfUsInd.head()


Out[197]:
Date USD_Value
0 9/30/2014 81.3001
1 9/29/2014 80.9136
2 9/26/2014 80.9983
3 9/25/2014 80.5957
4 9/24/2014 80.4465

5 rows × 2 columns


In [198]:
dfMaster = merge(dfOil,dfSP500,on='Date',how='inner')
dfMaster = merge(dfMaster,dfNyse,on='Date',how='inner')
dfMaster = merge(dfMaster,dfUsInd,on='Date',how='inner')

In [199]:
dfMaster.head()


Out[199]:
Date Oil_Value SP500_Value NYSE_Value USD_Value
0 9/30/2014 91.17 1972.29 10702.93 81.3001
1 9/29/2014 94.53 1977.80 10749.05 80.9136
2 9/26/2014 95.55 1982.85 10798.88 80.9983
3 9/25/2014 93.59 1965.99 10722.21 80.5957
4 9/24/2014 93.60 1998.30 10885.60 80.4465

5 rows × 5 columns


In [200]:
#Corelation Heat Matrix

def computeDataTableCorr(datatable, columnNames):
    corrCandidates =  datatable[candidatesList]
    return corrCandidates.corr()

# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    #_ = pd.scatter_matrix(corrTable, diagonal='kde', figsize=(10, 10))
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)
    
candidatesList = ['Oil_Value',	'SP500_Value',	'NYSE_Value',	'USD_Value']
corrTable = computeDataTableCorr(dfMaster,candidatesList)
displayCorrHeatGraph(corrTable,'Correlation Heat Matrix (Oil_Value,SP500_Value,NYSE_Value,USD_Value)')



In [201]:
dfCorrOil = corrTable[:1]
dfCorrOil


Out[201]:
Oil_Value SP500_Value NYSE_Value USD_Value
Oil_Value 1 0.626521 0.789685 -0.804935

1 rows × 4 columns

More the p-Value, lesser the significance


In [202]:
#pValue Test
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):     
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        df_sh.apply(np.random.shuffle, axis=axis)
        newCor = df_sh['Col1'].corr(df_sh['Col2'])
        if (orgCorr < 0): orgCorr = orgCorr * -1
        if (newCor < 0): newCor = newCor * -1
        #print(newCor)
        diff = abs(newCor - orgCorr)
        #print(diff)
        if (diff < threshold): success_count = success_count + 1
    p = success_count / runs
    return p

dfpValue = pd.DataFrame(data=None,columns=candidatesList)
pValue = []
pValue.append(0)

for i in range(1,len(candidatesList),1):
    orgCorr = dfCorrOil[candidatesList[i]]
    #print(orgCorr)
    temp_df = pd.DataFrame({'Col1':dfMaster[candidatesList[0]], \
                                'Col2':dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df,np.float(orgCorr),pRuns,pThreshold))

dfpValue.loc[0] = pValue
dfpValue


Out[202]:
Oil_Value SP500_Value NYSE_Value USD_Value
0 0 0 0 0

1 rows × 4 columns

As the pValues are all zeros, The factors does not correlates by chance.

Model 1.0: Multiple Linear Regression


In [203]:
import statsmodels.api as sm

In [204]:
#Inputs, Train and Test Data preparation
trainSize = np.floor(len(dfMaster['Date']) * trainingRatio) #80:20 ratio
dfMasterTrain = dfMaster[len(dfMaster)-np.int(trainSize):len(dfMaster)]
dfMasterTest = dfMaster[0:(len(dfMaster)-np.int(trainSize))-1]

In [205]:
xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[1]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         ]
xArrTrain = np.array(xArrTrain)
xArrTest = [ \
         #np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[1]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         ]
xArrTest = np.array(xArrTest)

yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

In [206]:
def mvRegress(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x,res):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)

In [207]:
res = mvRegress(yArrTrain, xArrTrain)

In [208]:
res.summary()


Out[208]:
OLS Regression Results
Dep. Variable: y R-squared: 0.817
Model: OLS Adj. R-squared: 0.816
Method: Least Squares F-statistic: 4422.
Date: Thu, 06 Nov 2014 Prob (F-statistic): 0.00
Time: 04:08:29 Log-Likelihood: -9853.8
No. Observations: 2985 AIC: 1.972e+04
Df Residuals: 2981 BIC: 1.974e+04
Df Model: 3
coef std err t P>|t| [95.0% Conf. Int.]
x1 -0.4984 0.017 -28.652 0.000 -0.532 -0.464
x2 0.0182 0.000 66.455 0.000 0.018 0.019
x3 -0.0625 0.002 -40.188 0.000 -0.066 -0.059
const 36.9946 1.580 23.417 0.000 33.897 40.092
Omnibus: 19.455 Durbin-Watson: 0.017
Prob(Omnibus): 0.000 Jarque-Bera (JB): 19.647
Skew: -0.190 Prob(JB): 5.42e-05
Kurtosis: 2.887 Cond. No. 7.93e+04

In [209]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest,res)
tw = np.empty(len(yPred0))
tw.fill(20.4288959475)
yPred0=np.add(yPred0,tw)
#yPred0

In [210]:
#yArrTest

In [211]:
def futurePredictMLR(res,sp500,nyse,usdi):
    return (res.params[3] + (res.params[2] * sp500) + (res.params[1] * nyse) + (res.params[0] * usdi))

Future Oil Price (By Model 1.0 - Multiple Linear Regression)

OP(i) = (3.69945988e+01) + ( (-6.24695329e-02) * sp500(i) ) + ( (1.82062382e-02) * nyse(i) ) + ( (-4.98367518e-01) * usdi(i) )

Model 1.1: Simple Autoregressive Model based on the last three day prices only.


In [212]:
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         ]
xArrTrain = np.array(xArrTrain)
#train is essentially the same as test!! 

xArrTest = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         ]

xArrTest = np.array(xArrTest)




#print (xArrTrain)            
#print (xArrTrain)            
#print(xArrTrain[0][])

In [213]:
print (xArrTest)
print (xArrTrain)
#print (xArrTrain[0][5])
n=len(xArrTrain[0])
for i in range(0,4,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTrain[i][j]=xArrTrain[i][j+i+1]
        else:
            xArrTrain[i][j]=xArrTrain[i][n-1]
            
n=len(xArrTest[0])
for i in range(0,4,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTest[i][j]=xArrTest[i][j+i+1]
        else:
            xArrTest[i][j]=xArrTest[i][n-1]   

print (xArrTrain)
print (xArrTest)


[[ 91.17  94.53  95.55 ...,  59.91  58.69  58.23]
 [ 91.17  94.53  95.55 ...,  59.91  58.69  58.23]
 [ 91.17  94.53  95.55 ...,  59.91  58.69  58.23]
 [ 91.17  94.53  95.55 ...,  59.91  58.69  58.23]]
[[ 58.5   59.68  60.02 ...,  18.01  17.97  18.16]
 [ 58.5   59.68  60.02 ...,  18.01  17.97  18.16]
 [ 58.5   59.68  60.02 ...,  18.01  17.97  18.16]
 [ 58.5   59.68  60.02 ...,  18.01  17.97  18.16]]
[[ 59.68  60.02  59.53 ...,  17.97  18.16  18.16]
 [ 60.02  59.53  58.64 ...,  18.16  18.16  18.16]
 [ 59.53  58.64  60.96 ...,  18.16  18.16  18.16]
 [ 58.64  60.96  62.9  ...,  18.16  18.16  18.16]]
[[ 94.53  95.55  93.59 ...,  58.69  58.23  58.23]
 [ 95.55  93.59  93.6  ...,  58.23  58.23  58.23]
 [ 93.59  93.6   91.55 ...,  58.23  58.23  58.23]
 [ 93.6   91.55  91.46 ...,  58.23  58.23  58.23]]

In [214]:
result = mvRegress(yArrTrain, xArrTrain)
result.summary()
yPred1 = mvPredict(xArrTest,result)
tweak1 = np.empty(len(yPred1))
tweak1.fill(0.0385522746912)
yPred1 = np.add(yPred1,tweak1)
print (result.params[3])
print (result.params[2])
print (result.params[1])
print (result.params[0])


0.960130922064
0.00316676486387
0.0173167845641
0.0187608682098

Time Series Analysis - Auto Regression Models

Model 1.2: Autoregressive Moving Average models.(ARMA)


In [215]:
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
from __future__ import print_function

dfOil_value = dfOil['Oil_Value']
dfOil_value.plot(figsize=(12,4))

acf_values = sm.tsa.stattools.acf(dfOil_value, unbiased=False, nlags=40, confint=None, qstat=False, fft=False, alpha=None)
#print (acf_values)
pacf_values = sm.tsa.stattools.pacf(dfOil_value, nlags=40, method='ywunbiased', alpha=None)
print (pacf_values)
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dfOil_value.values.squeeze(), lags=10, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dfOil_value, lags=10, ax=ax2)
#statsmodels.tsa.stattools.acf
#arma_mod_20 = sm.tsa.ARMA(dfOil_value, (2,0)).fit()


[ 1.          0.99912811  0.02249948  0.02998447 -0.02520848 -0.01969816
  0.05981421 -0.00856479  0.00788236  0.00949273 -0.01232917 -0.00266199
  0.01350931 -0.03329588 -0.0120559   0.00484517 -0.01540341 -0.00970453
  0.0160839  -0.0054594  -0.03468653 -0.01406609 -0.01279195  0.03179006
 -0.00723417 -0.01224124 -0.02427379  0.01520317 -0.0231513  -0.01248898
 -0.01761985  0.0083593  -0.00936983 -0.0236439   0.02795965  0.00334735
 -0.02408435 -0.01057808 -0.03187716 -0.00960917 -0.06561635]

Model 1.3: Multiple Regression using autoregressive parameters as well as other exogenous parameters.


In [216]:
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[1]]), \
         np.array(dfMasterTrain[candidatesList[1]]), \
         np.array(dfMasterTrain[candidatesList[1]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         ]
xArrTrain = np.array(xArrTrain)
#train is essentially the same as test!! 

xArrTest = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[1]]), \
         np.array(dfMasterTest[candidatesList[1]]), \
         np.array(dfMasterTest[candidatesList[1]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         ]

xArrTest = np.array(xArrTest)

n=len(xArrTrain[0])
def array_shift(xArrTrain,n):
    for i in range(0,3,1):
        for j in range(0,n,1):
            if(j<n-i-1):
                xArrTrain[i][j]=xArrTrain[i][j+i+1]
                xArrTrain[i+3][j]=xArrTrain[i+3][j+i+1]
                xArrTrain[i+6][j]=xArrTrain[i+6][j+i+1]
                xArrTrain[i+9][j]=xArrTrain[i+9][j+i+1]
            else:
                xArrTrain[i][j]=xArrTrain[i][n-1]
                xArrTrain[i+3][j]=xArrTrain[i+3][n-1]
                xArrTrain[i+6][j]=xArrTrain[i+6][n-1]
                xArrTrain[i+9][j]=xArrTrain[i+9][n-1]
 
            
array_shift(xArrTrain,len(xArrTrain[0]))
array_shift(xArrTest, len(xArrTest[0]))
#=len(xArrTest[0])

print (len(xArrTest[0]))
print (len(xArrTrain[0]))
print (xArrTrain)
print (xArrTest)
result = mvRegress(yArrTrain, xArrTrain)
result.summary()
yPred2 = mvPredict(xArrTest,result)
tweak2 = np.empty(len(yPred1))
tweak2.fill(0.0892416479015)
yPred2 = np.add(yPred2,tweak2)
print (result.params)


1990
2985
[[ 59.68    60.02    59.53   ...,  17.97    18.16    18.16  ]
 [ 60.02    59.53    58.64   ...,  18.16    18.16    18.16  ]
 [ 59.53    58.64    60.96   ...,  18.16    18.16    18.16  ]
 ..., 
 [ 82.5729  82.1892  82.2549 ...,  85.9173  85.9581  85.9581]
 [ 82.1892  82.2549  81.939  ...,  85.9581  85.9581  85.9581]
 [ 82.2549  81.939   81.7394 ...,  85.9581  85.9581  85.9581]]
[[ 94.53    95.55    93.59   ...,  58.69    58.23    58.23  ]
 [ 95.55    93.59    93.6    ...,  58.23    58.23    58.23  ]
 [ 93.59    93.6     91.55   ...,  58.23    58.23    58.23  ]
 ..., 
 [ 80.9136  80.9983  80.5957 ...,  83.2409  83.0824  83.0824]
 [ 80.9983  80.5957  80.4465 ...,  83.0824  83.0824  83.0824]
 [ 80.5957  80.4465  80.1793 ...,  83.0824  83.0824  83.0824]]
[  1.77774592e-06  -2.08231456e-02   1.65980463e-02   4.70805988e-04
  -8.34782763e-04   4.54737630e-04  -1.95729478e-03   1.90276049e-03
  -1.84763557e-04   3.22455778e-02   3.30926909e-03   9.58779182e-01
   3.06556315e-01]

Model 1.4: Vector Autogressive Processes (VAR)

Error Metrics


In [217]:
from sklearn.metrics import mean_squared_error

In [218]:
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = np.sqrt(mean_squared_error(yArrTest,yPred0))
errABS_y0= np.absolute(yPred0-yArrTest)

errVar_y1 = 100 * (np.absolute(yPred1 - yArrTest)/yArrTest)
errRMS_y1 = np.sqrt(mean_squared_error(yArrTest,yPred1))
errABS_y1= np.absolute(yPred1-yArrTest)

errVar_y2 = 100 * (np.absolute(yPred2 - yArrTest)/yArrTest)
errRMS_y2 = np.sqrt(mean_squared_error(yArrTest,yPred2))
errABS_y2= np.absolute(yPred2-yArrTest)

err_y0 = yPred0 - yArrTest
print ("Mean Model 1.0 :=")
print (np.mean(err_y0))
print ("Standard Deviation Model 1.0 :=")
print (np.std(err_y0))
err_y1 = yPred1 - yArrTest
#print 'Mean Model 1.1:='
print ("Mean Model 1.1 :=")
print (np.mean(err_y1))
print ("Standard Deviation Model 1.1 :=")
print (np.std(err_y1))

err_y2 = yPred2 - yArrTest
#print 'Mean Model 1.1:='
print ("Mean Model 1.2 :=")
print (np.mean(err_y2))
print ("Standard Deviation Model 1.2 :=")
print (np.std(err_y2))

def plot(err,model_number,c):
    fig, axes = plt.subplots(1, 1, figsize=(12,4))
    axes.hist(err, color=c, bins=120)
    axes.set_ylabel('Error Frequency')
    axes.set_xlabel('Error')
    axes.set_title("Error Variations Model-"+model_number)

errNorm_y0 = 100*((yPred0 - yArrTest)/yArrTest)
errNorm_y1 = 100*((yPred1 - yArrTest)/yArrTest)
errNorm_y2 = 100*((yPred2 - yArrTest)/yArrTest)
plot (err_y1,"1.1",'g')
plot (err_y0,"1.0",'b')
plot (errNorm_y0,"1.0 - normalized error",'r')
plot (errNorm_y1,"1.1 - normalized error",'c')
plot (err_y2,"1.2",'g')
plot (errNorm_y2,"1.2 - normalized error",'r')


Mean Model 1.0 :=
1.84235162023e-12
Standard Deviation Model 1.0 :=
16.7243130173
Mean Model 1.1 :=
-3.19387174441e-14
Standard Deviation Model 1.1 :=
1.85056654172
Mean Model 1.2 :=
1.69994671606e-14
Standard Deviation Model 1.2 :=
1.84083676985

In [219]:
dfErr = pd.DataFrame(data=None, columns=['Model','Minimum % Error','Maximum % Error', 'RMSE Error', 'Mean Absolute Error','Mean Percentage Error'])
dfErr['Model'] = ('Model 1.0','Model 1.1 (Simple Autoregressive)','Model 1.2')
dfErr['Minimum % Error'] = (min(errVar_y0),min(errVar_y1),min(errVar_y2))
dfErr['Maximum % Error'] = (max(errVar_y0),max(errVar_y1),max(errVar_y2))
dfErr['RMSE Error'] = (errRMS_y0,errRMS_y1,errRMS_y2)
dfErr['Mean Absolute Error'] = (np.mean(errABS_y0),np.mean(errABS_y1),np.mean(errABS_y2))
dfErr['Mean Percentage Error'] = (np.mean(errVar_y0),np.mean(errVar_y1),np.mean(errABS_y2))

In [220]:
dfErr


Out[220]:
Model Minimum % Error Maximum % Error RMSE Error Mean Absolute Error Mean Percentage Error
0 Model 1.0 0.015922 109.852184 16.724313 12.751749 16.501485
1 Model 1.1 (Simple Autoregressive) 0.001422 15.434670 1.850567 1.311035 1.627105
2 Model 1.2 0.000516 15.315319 1.840837 1.310016 1.310016

3 rows × 6 columns


In [220]: