In [1]:
print("Linear Regression Model")


Linear Regression Model

In [2]:
#Needed Libraries
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from pandas import merge

In [3]:
#Input Variables
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6
pRuns = 10 
pThreshold = 0.4

In [4]:
#Source Data Loading

dfOil = pd.read_csv(oilDataPath)
dfSP500 = pd.read_csv(sp500DataPath)
dfNyse = pd.read_csv(nyseDataPath)
dfUsInd = pd.read_csv(usdIndexDataPath)

In [5]:
dfOil.head()


Out[5]:
Date Oil_Value
0 9/30/2014 91.17
1 9/29/2014 94.53
2 9/26/2014 95.55
3 9/25/2014 93.59
4 9/24/2014 93.60

In [6]:
dfSP500.head()


Out[6]:
Date SP500_Value
0 9/30/2014 1972.29
1 9/29/2014 1977.80
2 9/26/2014 1982.85
3 9/25/2014 1965.99
4 9/24/2014 1998.30

In [7]:
dfNyse.head()


Out[7]:
Date NYSE_Value
0 9/30/2014 10702.93
1 9/29/2014 10749.05
2 9/26/2014 10798.88
3 9/25/2014 10722.21
4 9/24/2014 10885.60

In [8]:
dfUsInd.head()


Out[8]:
Date USD_Value
0 9/30/2014 81.3001
1 9/29/2014 80.9136
2 9/26/2014 80.9983
3 9/25/2014 80.5957
4 9/24/2014 80.4465

In [9]:
dfMaster = merge(dfOil,dfSP500,on='Date',how='inner')
dfMaster = merge(dfMaster,dfNyse,on='Date',how='inner')
dfMaster = merge(dfMaster,dfUsInd,on='Date',how='inner')

In [10]:
dfMaster.head()


Out[10]:
Date Oil_Value SP500_Value NYSE_Value USD_Value
0 9/30/2014 91.17 1972.29 10702.93 81.3001
1 9/29/2014 94.53 1977.80 10749.05 80.9136
2 9/26/2014 95.55 1982.85 10798.88 80.9983
3 9/25/2014 93.59 1965.99 10722.21 80.5957
4 9/24/2014 93.60 1998.30 10885.60 80.4465

In [11]:
#Corelation Heat Matrix

def computeDataTableCorr(datatable, columnNames):
    corrCandidates =  datatable[candidatesList]
    return corrCandidates.corr()

# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    #_ = pd.scatter_matrix(corrTable, diagonal='kde', figsize=(10, 10))
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)
    
candidatesList = ['Oil_Value',	'SP500_Value',	'NYSE_Value',	'USD_Value']
corrTable = computeDataTableCorr(dfMaster,candidatesList)
displayCorrHeatGraph(corrTable,'Correlation Heat Matrix (Oil_Value,SP500_Value,NYSE_Value,USD_Value)')



In [12]:
dfCorrOil = corrTable[:1]
dfCorrOil


Out[12]:
Oil_Value SP500_Value NYSE_Value USD_Value
Oil_Value 1 0.626521 0.789685 -0.804935

More the p-Value, lesser the significance


In [16]:
#pValue Test
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):     
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        df_sh.apply(np.random.shuffle, axis=axis)
        newCor = df_sh['Col1'].corr(df_sh['Col2'])
        if (orgCorr < 0): orgCorr = orgCorr * -1
        if (newCor < 0): newCor = newCor * -1
        #print(newCor)
        diff = abs(newCor - orgCorr)
        #print(diff)
        if (diff < threshold): success_count = success_count + 1
    p = success_count / runs
    return p

dfpValue = pd.DataFrame(data=None,columns=candidatesList)
pValue = []
pValue.append(0)

for i in range(1,len(candidatesList),1):
    orgCorr = dfCorrOil[candidatesList[i]]
    #print(orgCorr)
    temp_df = pd.DataFrame({'Col1':dfMaster[candidatesList[0]], \
                                'Col2':dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df,np.float(orgCorr),pRuns,pThreshold))

dfpValue.loc[0] = pValue
dfpValue


Out[16]:
Oil_Value SP500_Value NYSE_Value USD_Value
0 0 0 0 0

As the pValues are all zeros, The factors does not correlates by chance.

Model 1.0: Multivariate Linear Regression


In [251]:
import statsmodels.api as sm

In [252]:
#Inputs, Train and Test Data preparation
trainSize = np.floor(len(dfMaster['Date']) * trainingRatio) #60:40 ratio
dfMasterTrain = dfMaster[len(dfMaster)-np.int(trainSize):len(dfMaster)]
dfMasterTest = dfMaster[0:np.int(trainSize)]

In [253]:
xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[1]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         ]
xArrTrain = np.array(xArrTrain)
xArrTest = [ \
         #np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[1]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         ]
xArrTest = np.array(xArrTest)

yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

In [254]:
def mvRegress(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x,res):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)

In [255]:
res = mvRegress(yArrTrain, xArrTrain)

In [256]:
res.summary()


Out[256]:
OLS Regression Results
Dep. Variable: y R-squared: 0.817
Model: OLS Adj. R-squared: 0.816
Method: Least Squares F-statistic: 4422.
Date: Mon, 27 Oct 2014 Prob (F-statistic): 0.00
Time: 06:03:32 Log-Likelihood: -9853.8
No. Observations: 2985 AIC: 1.972e+04
Df Residuals: 2981 BIC: 1.974e+04
Df Model: 3
coef std err t P>|t| [95.0% Conf. Int.]
x1 -0.4984 0.017 -28.652 0.000 -0.532 -0.464
x2 0.0182 0.000 66.455 0.000 0.018 0.019
x3 -0.0625 0.002 -40.188 0.000 -0.066 -0.059
const 36.9946 1.580 23.417 0.000 33.897 40.092
Omnibus: 19.455 Durbin-Watson: 0.017
Prob(Omnibus): 0.000 Jarque-Bera (JB): 19.647
Skew: -0.190 Prob(JB): 5.42e-05
Kurtosis: 2.887 Cond. No. 7.93e+04

In [257]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest,res)
yPred0


Out[257]:
array([ 68.12932771,  68.81741134,  69.36694532, ...,  17.65250357,
        18.59926649,  18.22854643])

In [258]:
yArrTest


Out[258]:
array([ 91.17,  94.53,  95.55, ...,  29.31,  29.56,  29.65])

In [259]:
def futurePredictMLR(res,sp500,nyse,usdi):
    return (res.params[3] + (res.params[2] * sp500) + (res.params[1] * nyse) + (res.params[0] * usdi))

Future Oil Price (By Model 1.0 - Multivariate Linear Regression)

OP(i) = (3.69945988e+01) + ( (-6.24695329e-02) * sp500(i) ) + ( (1.82062382e-02) * nyse(i) ) + ( (-4.98367518e-01) * usdi(i) )

Model 1.1: Polynomial Fit Regression

Time Series Analysis - Auto Regression Models

Model 1.2: Univariate Autogressive Processes (AR)

Model 1.3: Autogressive Moving-Average Processes (ARMA) and Kalman Filter

Model 1.4: Vector Autogressive Processes (VAR)

Error Metrics


In [260]:
from sklearn.metrics import mean_squared_error

In [261]:
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = sqrt(mean_squared_error(yArrTest,yPred0))
errABS_y0= np.absolute(yPred0-yArrTest)

In [262]:
dfErr = pd.DataFrame(data=None, columns=['Model','Minimum % Error','Maximum % Error', 'RMSE Error', 'Mean Absolute Error','Mean Percentage Error'])
dfErr['Model'] = ('Model 1.0','Model 1.1 (Polynomial Fit)')
dfErr['Minimum % Error'] = (min(errVar_y0),0)
dfErr['Maximum % Error'] = (max(errVar_y0),0)
dfErr['RMSE Error'] = (errRMS_y0,0)
dfErr['Mean Absolute Error'] = (np.mean(errABS_y0),0)
dfErr['Mean Percentage Error'] = (np.mean(errVar_y0),0)

In [263]:
dfErr


Out[263]:
Model Minimum % Error Maximum % Error RMSE Error Mean Absolute Error Mean Percentage Error
0 Model 1.0 0.005307 47.564602 21.85895 17.587581 21.750334
1 Model 1.1 (Polynomial Fit) 0.000000 0.000000 0.00000 0.000000 0.000000

In [263]: