In [8]:
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from pandas import merge

In [9]:
# Remote CSV data sources (daily series, 1994-10-03 to 2014-09-30) hosted in
# the CSE591-Data-Science-Project GitHub repository.
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/GOLD_DAILY_1994-10-03_2014-09-30.csv'

# Fraction of the merged dataset used for training (60:40 train/test split).
trainingRatio = 0.6
# NOTE(review): pRuns and pThreshold are not referenced anywhere in this
# chunk of the notebook -- presumably used by later cells; confirm before
# removing.
pRuns = 10 
pThreshold = 0.4

In [10]:
# Download each series into its own DataFrame. Each CSV has two columns:
# 'Date' plus one value column ('Oil_Value', 'SP500_Value', 'NYSE_Value',
# 'USD_Value', 'Gold_Value' respectively).
dfOil = pd.read_csv(oilDataPath)
dfSP500 = pd.read_csv(sp500DataPath)
dfNyse = pd.read_csv(nyseDataPath)
dfUsInd = pd.read_csv(usdIndexDataPath)
dfGold=pd.read_csv(goldDataPath)

In [11]:
dfOil.max()


Out[11]:
Date         9/9/2014
Oil_Value      145.31
dtype: object

In [12]:
dfOil.min()


Out[12]:
Date         1/10/1995
Oil_Value        10.82
dtype: object

In [16]:
# Normalise the NYSE series: centre on the mean, scale by the full value
# range, and express as a percentage. NOTE: this mutates dfNyse in place,
# so re-running the cell re-normalises already-normalised data.
print(dfNyse.head())

nyseValues = dfNyse['NYSE_Value']
nyseRange = nyseValues.max() - nyseValues.min()
dfNyse['NYSE_Value'] = (nyseValues - nyseValues.mean()) * 100 / nyseRange

print(dfNyse.head())


        Date  NYSE_Value
0  9/30/2014    10702.93
1  9/29/2014    10749.05
2  9/26/2014    10798.88
3  9/25/2014    10722.21
4  9/24/2014    10885.60

[5 rows x 2 columns]
        Date  NYSE_Value
0  9/30/2014   46.045729
1  9/29/2014   46.586609
2  9/26/2014   47.170999
3  9/25/2014   46.271838
4  9/24/2014   48.188023

[5 rows x 2 columns]

In [7]:
# Normalise the gold series the same way as NYSE: mean-centred, scaled by
# the value range, expressed as a percentage. NOTE: mutates dfGold in place,
# so the cell is not idempotent on re-run.
print(dfGold.head())

goldValues = dfGold['Gold_Value']
goldRange = goldValues.max() - goldValues.min()
dfGold['Gold_Value'] = (goldValues - goldValues.mean()) * 100 / goldRange

print(dfGold.head())


        Date  Gold_Value
0  9/30/2014      1216.5
1  9/29/2014      1219.5
2  9/26/2014      1213.8
3  9/25/2014      1213.8
4  9/24/2014      1217.3

[5 rows x 2 columns]
        Date  Gold_Value
0  9/30/2014   31.961904
1  9/29/2014   32.144586
2  9/26/2014   31.797490
3  9/25/2014   31.797490
4  9/24/2014   32.010619

[5 rows x 2 columns]

In [7]:


In [8]:
# Build the master table by inner-joining all five series on 'Date', so only
# trading days present in every series survive.
dfMaster = dfOil
for frame in (dfSP500, dfNyse, dfUsInd, dfGold):
    dfMaster = merge(dfMaster, frame, on='Date', how='inner')

In [9]:
dfMaster.head()


Out[9]:
Date Oil_Value SP500_Value NYSE_Value USD_Value Gold_Value
0 9/30/2014 91.17 1972.29 46.045729 81.3001 31.961904
1 9/29/2014 94.53 1977.80 46.586609 80.9136 32.144586
2 9/26/2014 95.55 1982.85 47.170999 80.9983 31.797490
3 9/25/2014 93.59 1965.99 46.271838 80.5957 31.797490
4 9/24/2014 93.60 1998.30 48.188023 80.4465 32.010619

5 rows × 6 columns


In [10]:
# Correlation heat matrix
def computeDataTableCorr(datatable, columnNames):
    """Return the pairwise correlation matrix of the requested columns.

    Parameters
    ----------
    datatable : pandas.DataFrame
        Source table; must contain every column in `columnNames`.
    columnNames : list of str
        Columns to correlate.

    Returns
    -------
    pandas.DataFrame
        Square correlation matrix indexed/columned by `columnNames`.
    """
    # BUG FIX: the original body referenced the module-level `candidatesList`
    # instead of the `columnNames` parameter, so the argument was silently
    # ignored (it only worked because the caller happened to pass that same
    # global).
    corrCandidates = datatable[columnNames]
    return corrCandidates.corr()

# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    """Render a correlation matrix as a heat map with labelled, titled axes.

    cTable is expected to be a square pandas DataFrame (e.g. the output of
    DataFrame.corr()); its column names label both axes.
    """
    plt.imshow(cTable, cmap='PRGn', interpolation='none')
    plt.colorbar()
    labels = cTable.columns
    positions = range(len(cTable))
    plt.xticks(positions, labels, rotation=90)
    plt.yticks(positions, labels)
    plt.title(title)
    
# All candidate series; Oil_Value is the eventual regression target.
candidatesList = ['Oil_Value', 'SP500_Value', 'NYSE_Value', 'USD_Value', 'Gold_Value']
corrTable = computeDataTableCorr(dfMaster, candidatesList)
displayCorrHeatGraph(corrTable, 'Correlation Heat Matrix (Oil_Value,SP500_Value,NYSE_Value,USD_Value,Gold_Value)')



In [11]:
# First row of the correlation matrix: Oil_Value's correlation with every
# candidate series.
dfCorrOil = corrTable.iloc[:1]
dfCorrOil


Out[11]:
Oil_Value SP500_Value NYSE_Value USD_Value Gold_Value
Oil_Value 1 0.626521 0.789685 -0.804935 0.860884

1 rows × 5 columns


In [12]:
import statsmodels.api as sm

In [13]:
# 60:40 chronological split. Rows are ordered newest-first (see the head()
# previews above), so the TRAINING set is taken from the tail (oldest rows)
# and the TEST set from the head (newest rows).
# FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24, so
# np.int(trainSize) raises AttributeError on current NumPy. int() truncation
# equals np.floor for the non-negative value computed here, so behaviour is
# unchanged.
trainSize = int(len(dfMaster['Date']) * trainingRatio)
dfMasterTrain = dfMaster[len(dfMaster) - trainSize:len(dfMaster)]
dfMasterTest = dfMaster[0:trainSize]

In [23]:
# Predictors: only NYSE_Value (index 2) and Gold_Value (index 4) are kept;
# the other candidates were dropped after the correlation analysis above.
# Each xArr is a 2-D array with one ROW per predictor series.
selectedFeatures = (candidatesList[2], candidatesList[4])

xArrTrain = np.array([np.array(dfMasterTrain[name]) for name in selectedFeatures])
xArrTest = np.array([np.array(dfMasterTest[name]) for name in selectedFeatures])

# Response variable: the oil price (candidatesList[0]).
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

In [25]:
def mvRegress(y, x):
    """Fit an OLS model of y on the predictor rows of x, with an intercept.

    x is a sequence of 1-D predictor arrays (one per row). Each successive
    predictor is stacked in FRONT of the matrix, so the final column order is
    x[-1], ..., x[1], x[0], constant. The repeated sm.add_constant calls are
    presumably no-ops because a ones column is already present -- kept as-is
    so the design matrix matches mvPredict exactly (TODO confirm against the
    statsmodels version in use).
    """
    intercept = np.ones(len(x[0]))
    design = sm.add_constant(np.column_stack((x[0], intercept)))
    for predictor in x[1:]:
        design = sm.add_constant(np.column_stack((predictor, design)))
    return sm.OLS(y, design).fit()

def mvPredict(x,res):
    """Predict from a fitted OLS result `res` for new predictor rows `x`.

    Rebuilds the design matrix with the exact same column layout as
    mvRegress (x[-1], ..., x[0], constant) -- the two constructions must
    stay in lock-step or predictions will pair coefficients with the wrong
    columns.
    """
    intercept = np.ones(len(x[0]))
    design = sm.add_constant(np.column_stack((x[0], intercept)))
    for predictor in x[1:]:
        design = sm.add_constant(np.column_stack((predictor, design)))
    return res.predict(design)

In [26]:
res = mvRegress(yArrTrain, xArrTrain)

In [27]:
res.summary()


Out[27]:
OLS Regression Results
Dep. Variable: y R-squared: 0.858
Model: OLS Adj. R-squared: 0.858
Method: Least Squares F-statistic: 8997.
Date: Sat, 08 Nov 2014 Prob (F-statistic): 0.00
Time: 18:46:52 Log-Likelihood: -9473.0
No. Observations: 2985 AIC: 1.895e+04
Df Residuals: 2982 BIC: 1.897e+04
Df Model: 2
coef std err t P>|t| [95.0% Conf. Int.]
x1 1.9450 0.021 94.071 0.000 1.904 1.985
x2 0.4646 0.006 73.656 0.000 0.452 0.477
const 75.1768 0.423 177.629 0.000 74.347 76.007
Omnibus: 11.770 Durbin-Watson: 0.023
Prob(Omnibus): 0.003 Jarque-Bera (JB): 12.699
Skew: 0.110 Prob(JB): 0.00175
Kurtosis: 3.231 Cond. No. 106.

In [28]:
# Predict oil prices on the held-out test split with the fitted model.
yPred0 = mvPredict(xArrTest,res)
yPred0


Out[28]:
array([ 158.73509486,  159.34170671,  158.93813936, ...,   18.42205423,
         19.13712712,   19.08672587])

Error Metrics


In [29]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [30]:
# Per-point and aggregate error metrics for Model 1.0 on the test split.
errABS_y0 = np.absolute(yPred0 - yArrTest)                # absolute error per point
errVar_y0 = 100 * (errABS_y0 / yArrTest)                  # percentage error per point
errRMS_y0 = sqrt(mean_squared_error(yArrTest, yPred0))    # root-mean-square error

In [31]:
# Error-summary table. The 'Model 1.1 (Polynomial Fit)' row is a zero-filled
# placeholder, presumably populated by a later cell -- TODO confirm.
dfErr = pd.DataFrame({
    'Model': ['Model 1.0', 'Model 1.1 (Polynomial Fit)'],
    'Minimum % Error': [min(errVar_y0), 0],
    'Maximum % Error': [max(errVar_y0), 0],
    'RMSE Error': [errRMS_y0, 0],
    'Mean Absolute Error': [np.mean(errABS_y0), 0],
    'Mean Percentage Error': [np.mean(errVar_y0), 0],
})

In [32]:
dfErr


Out[32]:
Model Minimum % Error Maximum % Error RMSE Error Mean Absolute Error Mean Percentage Error
0 Model 1.0 0.006584 184.16397 54.612428 40.727774 49.963674
1 Model 1.1 (Polynomial Fit) 0.000000 0.00000 0.000000 0.000000 0.000000

2 rows × 6 columns


In [22]: