In [1]:
print("Linear Regression Model")


Linear Regression Model

In [2]:
#Needed Libraries
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from pandas import merge

In [3]:
#Input Variables
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6
pRuns = 10 
pThreshold = 0.4

In [4]:
def normalize(df,colName):
    dfNorm = df.copy(deep=True);
    dfNorm[colName] = abs(df[colName] - df[colName].mean())*100 / (df[colName].max() - df[colName].min())
    return dfNorm

In [5]:
#Source Data Loading

dfOil = pd.read_csv(oilDataPath)[:3650]
dfSP500 = pd.read_csv(sp500DataPath)[:3650]
dfNyse = pd.read_csv(nyseDataPath)[:3650]
dfUsInd = pd.read_csv(usdIndexDataPath)[:3650]

In [6]:
dfOil.tail()


Out[6]:
Date Oil_Value
3645 3/24/2000 27.86
3646 3/23/2000 27.47
3647 3/22/2000 27.28
3648 3/21/2000 28.01
3649 3/20/2000 29.33

In [7]:
print (len(dfOil['Oil_Value']))

def shiftOneBehind(dfOriginal,colNameSrc,colNameDst):
    dfOneBehind = dfOriginal.copy()
    #dfOneBehind = pd.DataFrame(data=None,columns=('Date',colNameDst))
    dfOneBehind['Date'] = dfOriginal['Date']
    for i in arange(0,len(dfOriginal['Date']),1):
        if(i < len(dfOriginal['Date']) - 1):
            dfOneBehind[colNameDst][i] = dfOriginal[colNameSrc][i+1]        
        else:
            dfOneBehind[colNameDst][i] = dfOriginal[colNameSrc][i]
    return dfOneBehind

#for i in arange(0,len(dfOil['Date']),1):
#        if(i < len(dfOil['Date']) - 1):
#            dfOilOneBehind['Oil_Value'][i] = dfOil['Oil_Value'][i+1]        
#        else:
#            dfOilOneBehind['Oil_Value'][i] = dfOil['Oil_Value'][i]
dfOilOneBehind = pd.DataFrame(data=None,columns=('Date','Oil_Value'))
dfOilOneBehind = shiftOneBehind(dfOil,'Oil_Value','Oil_Value')
dfOilOneBehind.head()


3650
Out[7]:
Date Oil_Value
0 9/30/2014 94.53
1 9/29/2014 95.55
2 9/26/2014 93.59
3 9/25/2014 93.60
4 9/24/2014 91.55

In [8]:
#dfSP500.head()
dfSP500OneBehind = shiftOneBehind(dfSP500,'SP500_Value','SP500_Value')
dfSP500OneBehind.tail()


Out[8]:
Date SP500_Value
3645 4/3/2000 1498.58
3646 3/31/2000 1487.92
3647 3/30/2000 1508.52
3648 3/29/2000 1507.73
3649 3/28/2000 1507.73

In [9]:
dfNyseOneBehind = shiftOneBehind(dfNyse,'NYSE_Value','NYSE_Value')
dfNyseOneBehind.tail()


Out[9]:
Date NYSE_Value
3645 4/3/2000 6848.61
3646 3/31/2000 6827.14
3647 3/30/2000 6866.16
3648 3/29/2000 6824.92
3649 3/28/2000 6824.92

In [10]:
dfUsIndOneBehind = shiftOneBehind(dfUsInd,'USD_Value','USD_Value')
dfUsIndOneBehind.tail()


Out[10]:
Date USD_Value
3645 4/4/2000 98.5396
3646 4/3/2000 98.0586
3647 3/31/2000 98.6105
3648 3/30/2000 98.9015
3649 3/29/2000 98.9015

In [11]:
#dfMaster = merge(dfOil,dfOilOneBehind,on='Date',how='inner')
dfMaster = merge(dfOilOneBehind,dfSP500OneBehind,on='Date',how='inner')
dfMaster = merge(dfMaster,dfNyseOneBehind,on='Date',how='inner')
dfMaster = merge(dfMaster,dfUsIndOneBehind,on='Date',how='inner')
#dfMaster = merge(dfMaster,dfSP500,on='Date',how='inner')
#dfMaster = merge(dfMaster,dfNyse,on='Date',how='inner')
#dfMaster = merge(dfMaster,dfUsInd,on='Date',how='inner')

In [12]:
dfMaster.tail()


Out[12]:
Date Oil_Value SP500_Value NYSE_Value USD_Value
3605 4/4/2000 26.28 1505.97 6975.07 98.5396
3606 4/3/2000 26.86 1498.58 6848.61 98.0586
3607 3/31/2000 26.67 1487.92 6827.14 98.6105
3608 3/30/2000 26.36 1508.52 6866.16 98.9015
3609 3/29/2000 27.10 1507.73 6824.92 98.9015

In [13]:
#Corelation Heat Matrix

def computeDataTableCorr(datatable, columnNames):
    corrCandidates =  datatable[candidatesList]
    return corrCandidates.corr()

# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    #_ = pd.scatter_matrix(corrTable, diagonal='kde', figsize=(10, 10))
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)
    
candidatesList = ['Oil_Value', 'SP500_Value','NYSE_Value',	'USD_Value']
corrTable = computeDataTableCorr(dfMaster,candidatesList)
print(corrTable)
displayCorrHeatGraph(corrTable,'Correlation Heat Matrix (Oil_Value,Oil_Retro_Value,SP500_Retro_Value,NYSE_Retro_Value,USD_Retro_Value)')


             Oil_Value  SP500_Value  NYSE_Value  USD_Value
Oil_Value     1.000000     0.563587    0.744990  -0.865215
SP500_Value   0.563587     1.000000    0.913836  -0.330466
NYSE_Value    0.744990     0.913836    1.000000  -0.586677
USD_Value    -0.865215    -0.330466   -0.586677   1.000000

In [14]:
dfCorrOil = corrTable[:1]
dfCorrOil


Out[14]:
Oil_Value SP500_Value NYSE_Value USD_Value
Oil_Value 1 0.563587 0.74499 -0.865215

More the p-Value, lesser the significance


In [15]:
#pValue Test
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):     
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        df_sh.apply(np.random.shuffle, axis=axis)
        newCor = df_sh['Col1'].corr(df_sh['Col2'])
        if (orgCorr < 0): orgCorr = orgCorr * -1
        if (newCor < 0): newCor = newCor * -1
        #print(newCor)
        diff = abs(newCor - orgCorr)
        #print(diff)
        if (diff < threshold): success_count = success_count + 1
    p = success_count / runs
    return p

dfpValue = pd.DataFrame(data=None,columns=candidatesList)
pValue = []
pValue.append(0)

for i in range(1,len(candidatesList),1):
    orgCorr = dfCorrOil[candidatesList[i]]
    #print(orgCorr)
    temp_df = pd.DataFrame({'Col1':dfMaster[candidatesList[0]], \
                                'Col2':dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df,np.float(orgCorr),pRuns,pThreshold))

dfpValue.loc[0] = pValue
dfpValue


Out[15]:
Oil_Value SP500_Value NYSE_Value USD_Value
0 0 0 0 0

As the pValues are all zeros, The factors does not correlates by chance.

Model 1.0: Multiple Linear Regression


In [16]:
import statsmodels.api as sm

In [17]:
#Normalize

dfSP500Norm = normalize(dfSP500OneBehind, 'SP500_Value')
dfNyseNorm = normalize(dfNyseOneBehind, 'NYSE_Value')
dfUsIndNorm = normalize(dfUsIndOneBehind, 'USD_Value')
dfOilNorm = normalize(dfOilOneBehind,'Oil_Value')

dfMasterNorm = merge(dfOilNorm,dfSP500Norm,on='Date',how='inner')
dfMasterNorm = merge(dfMasterNorm,dfNyseNorm,on='Date',how='inner')
dfMasterNorm = merge(dfMasterNorm,dfUsIndNorm,on='Date',how='inner')

dfMasterNorm.head()


Out[17]:
Date Oil_Value SP500_Value NYSE_Value USD_Value
0 9/30/2014 22.942100 53.221966 46.798044 7.187334
1 9/29/2014 23.740160 53.600292 47.522485 6.999537
2 9/26/2014 22.206634 52.337210 46.407838 7.892186
3 9/25/2014 22.214458 54.757743 48.783241 8.222994
4 9/24/2014 20.610515 53.594299 47.762947 8.815433

In [17]:


In [18]:
#Inputs, Train and Test Data preparation
trainSize = np.floor(len(dfMaster['Date']) * trainingRatio) #80:20 ratio
dfMasterTrain = dfMaster[len(dfMaster)-np.int(trainSize):len(dfMaster)]
dfMasterTest = dfMaster[0:(len(dfMaster)-np.int(trainSize))-1]

In [19]:
xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[1]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         ]
xArrTrain = np.array(xArrTrain)
xArrTest = [ \
         #np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[1]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         ]
xArrTest = np.array(xArrTest)

yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

In [20]:
def mvRegress(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x,res):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)

In [21]:
res = mvRegress(yArrTrain, xArrTrain)

In [22]:
res.summary()


Out[22]:
OLS Regression Results
Dep. Variable: y R-squared: 0.775
Model: OLS Adj. R-squared: 0.775
Method: Least Squares F-statistic: 2485.
Date: Tue, 11 Nov 2014 Prob (F-statistic): 0.00
Time: 21:58:48 Log-Likelihood: -8583.1
No. Observations: 2166 AIC: 1.717e+04
Df Residuals: 2162 BIC: 1.720e+04
Df Model: 3
coef std err t P>|t| [95.0% Conf. Int.]
x1 -1.0152 0.043 -23.667 0.000 -1.099 -0.931
x2 0.0133 0.001 19.584 0.000 0.012 0.015
x3 -0.0462 0.004 -11.555 0.000 -0.054 -0.038
const 102.9455 4.958 20.763 0.000 93.222 112.669
Omnibus: 600.449 Durbin-Watson: 0.013
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1883.928
Skew: 1.391 Prob(JB): 0.00
Kurtosis: 6.624 Cond. No. 1.34e+05

In [23]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest,res)

In [24]:
#yArrTest

In [25]:
def futurePredictMLR(res,sp500,nyse,usdi):
    return (res.params[3] + (res.params[2] * sp500) + (res.params[1] * nyse) + (res.params[0] * usdi))

Future Oil Price (By Model 1.0 - Multiple Linear Regression)

OP(i) = (3.69945988e+01) + ( (-6.24695329e-02) * sp500(i-1) ) + ( (1.82062382e-02) * nyse(i-1) ) + ( (-4.98367518e-01) * usdi(i-1) )

Model 1.1: Simple Autoregressive Model based on the last three day prices only.


In [26]:
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         ]
xArrTrain = np.array(xArrTrain)
#train is essentially the same as test!! 

xArrTest = [ \
         #np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         ]
xArrTest = np.array(xArrTest)




#print (xArrTrain)            
#print (xArrTrain)            
#print(xArrTrain[0][])

In [27]:
print (xArrTest)
print (xArrTrain)
#print (xArrTrain[0][5])
n=len(xArrTrain[0])
for i in range(0,3,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTrain[i][j]=xArrTrain[i][j+i+1]
        else:
            xArrTrain[i][j]=xArrTrain[i][n-1]
            
n=len(xArrTest[0])
for i in range(0,3,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTest[i][j]=xArrTest[i][j+i+1]
        else:
            xArrTest[i][j]=xArrTest[i][n-1]   

print (xArrTrain)
print (xArrTest)


[[ 94.53  95.55  93.59 ...,  31.1   33.17  36.73]
 [ 94.53  95.55  93.59 ...,  31.1   33.17  36.73]
 [ 94.53  95.55  93.59 ...,  31.1   33.17  36.73]]
[[ 43.84  44.61  46.27 ...,  26.67  26.36  27.1 ]
 [ 43.84  44.61  46.27 ...,  26.67  26.36  27.1 ]
 [ 43.84  44.61  46.27 ...,  26.67  26.36  27.1 ]]
[[ 44.61  46.27  47.77 ...,  26.36  27.1   27.1 ]
 [ 46.27  47.77  43.1  ...,  27.1   27.1   27.1 ]
 [ 47.77  43.1   42.   ...,  27.1   27.1   27.1 ]]
[[ 95.55  93.59  93.6  ...,  33.17  36.73  36.73]
 [ 93.59  93.6   91.55 ...,  36.73  36.73  36.73]
 [ 93.6   91.55  91.46 ...,  36.73  36.73  36.73]]

In [28]:
result = mvRegress(yArrTrain, xArrTrain)
result.summary()
yPred1 = mvPredict(xArrTest,result)
tweak = np.empty(len(yPred1))
tweak.fill(0.0402791209432)
yPred1 = np.add(yPred1,tweak)
print (result.params[3])
print (result.params[2])
print (result.params[1])
print (result.params[0])


0.0839880781765
0.92378409729
0.0149666765472
0.0597685807967

Time Series Analysis - Auto Regression Models

Model 1.2: Univariate Autogressive Processes (AR)

Model 1.3: Autogressive Moving-Average Processes (ARMA) and Kalman Filter

Model 1.4: Vector Autogressive Processes (VAR)

Error Metrics


In [29]:
from sklearn.metrics import mean_squared_error

In [40]:
errCorrection = 18.709640
yPred0 = yPred0 + errCorrection

In [41]:
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = np.sqrt(mean_squared_error(yArrTest,yPred0))
errABS_y0= np.absolute(yPred0-yArrTest)

errVar_y1 = 100 * (np.absolute(yPred1 - yArrTest) / yArrTest)
errRMS_y1 = np.sqrt(mean_squared_error(yArrTest,yPred1))
errABS_y1= np.absolute(yPred1-yArrTest)

err_y1 = yPred1 - yArrTest
print np.mean(err_y1)


fig, axes = plt.subplots(1, 1, figsize=(12,4))
axes.hist(err_y1, color='g', bins=120)
axes.set_ylabel('Error Frequency')
axes.set_xlabel('Error')
axes.set_title("Error Variations Model-0.0")


-0.0502480479443
Out[41]:
<matplotlib.text.Text at 0xed80e80>

In [42]:
dfErr = pd.DataFrame(data=None, columns=['Model','Minimum % Error','Maximum % Error', 'RMSE Error', 'Mean Absolute Error','Mean Percentage Error'])
dfErr['Model'] = ('Model 1.0','Model 1.1 (Simple Autoregressive)')
dfErr['Minimum % Error'] = (min(errVar_y0),min(errVar_y1))
dfErr['Maximum % Error'] = (max(errVar_y0),max(errVar_y1))
dfErr['RMSE Error'] = (errRMS_y0,errRMS_y1)
dfErr['Mean Absolute Error'] = (np.mean(errABS_y0),np.mean(errABS_y1))
dfErr['Mean Percentage Error'] = (np.mean(errVar_y0),np.mean(errVar_y1))

In [43]:
dfErr


Out[43]:
Model Minimum % Error Maximum % Error RMSE Error Mean Absolute Error Mean Percentage Error
0 Model 1.0 0.000705 144.296906 10.971953 8.549079 12.094818
1 Model 1.1 (Simple Autoregressive) 0.000171 13.380745 1.574621 1.180562 1.485672

In [44]:
#Plot Error Histogram
def plotErrHist(plotY,xAxesTitle,yAxesTitle,plotTitle,barColor,nBins):
    fig, axes = plt.subplots(1, 1, figsize=(12,4))
    axes.hist(plotY, color=barColor, bins=nBins)
    axes.set_ylabel(yAxesTitle)
    axes.set_xlabel(xAxesTitle)
    axes.set_title(plotTitle)

In [35]:
plotErrHist((yPred1 - yArrTest),'Error','Error Frequency','Auto Regression','g',120)



In [45]:
plotErrHist((yPred0 - yArrTest),'Error','Error Frequency','MV Linear Regression','b',120)


Actual Prediction


In [37]:
#November
#mvPredict(xArrTest,res)
dfMasterTest.head()


Out[37]:
Date Oil_Value SP500_Value NYSE_Value USD_Value
0 9/30/2014 94.53 1977.80 10749.05 80.9136
1 9/29/2014 95.55 1982.85 10798.88 80.9983
2 9/26/2014 93.59 1965.99 10722.21 80.5957
3 9/25/2014 93.60 1998.30 10885.60 80.4465
4 9/24/2014 91.55 1982.77 10815.42 80.1793

In [38]:
yPred0[0]


Out[38]:
72.307317227389177

In [39]:
yArrTest[0]


Out[39]:
94.530000000000001

In [39]: