In [2]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts
from itertools import combinations
import multiprocessing
from multiprocessing import Pool
# cPickle exists only on Python 2; fall back to the Python 3 pickle module
# so the notebook runs on either interpreter under the same alias.
try:
    import cPickle as pkl
except ImportError:
    import pickle as pkl
In [3]:
# Load the training price data; the context manager guarantees the file
# handle is closed even if unpickling raises.
with open('trainData.pkl', 'rb') as trainInput:
    trainData = pkl.load(trainInput)
In [4]:
# Load the test price data; the context manager guarantees the file
# handle is closed even if unpickling raises.
with open('testData.pkl', 'rb') as testInput:
    testData = pkl.load(testInput)
In [5]:
# Extracts column headers from training data set and creates all possible pairs.
# The first column is skipped ([1:]) -- presumably a non-ticker column; confirm.
tickerlist = sorted(trainData.columns.tolist())[1:]
tickerpairs = list(combinations(tickerlist, 2))
# print() works on both Python 2 and 3 for a single argument.
print(tickerpairs[:10])  # Look at first 10 combinations of pairs
print(len(tickerpairs))  # Over 3 million possible pairs
In [6]:
# Parallel processing for mean sum-square-distance
def dist(tlist):
    """Sum-square-distance worker for one ticker pair (training sample).

    Parameters
    ----------
    tlist : tuple of (str, str)
        Pair of ticker symbols; both must be columns of the notebook-level
        ``trainData`` DataFrame.

    Returns
    -------
    tuple
        ``([(xname, yname)], ssd)`` where ``ssd`` is the sum of squared
        differences of the two series after each is standardized by its
        first observation; an empty tuple when too few overlapping
        observations exist.
    """
    xname, yname = tlist
    # Standardize each series by its first price so the two are comparable.
    x = trainData[xname] / trainData[xname][0]
    y = trainData[yname] / trainData[yname][0]
    # Remove any NA values from the pairwise difference.
    z = (x - y).dropna()
    # Only consider pairs with most of the data present.
    if len(z) > 495:
        # Vectorized square-and-sum; the original sum(map(lambda z: z**2, z))
        # shadowed z inside its own lambda.
        return ([(xname, yname)], (z ** 2).sum())
    else:
        return ()
if __name__ == '__main__':
    trainDistPool = Pool(processes=4)
    # Test just the first 100 pairs - remove [0:100] for full test - warning, takes a long time!!
    trainDistResult = pd.DataFrame(trainDistPool.map(dist, tickerpairs[0:100]))
    trainDistPool.close()
    trainDistPool.join()  # wait for workers to exit so resources are released
In [8]:
# Unzip the precomputed distance results committed to the GitHub repo
# (IPython shell escape; requires gunzip on the PATH).
!gunzip -c trainDistResult.pkl.gz > trainDistResult.pkl
In [9]:
# Read the precomputed distance results back from the pickle file;
# the context manager guarantees the handle is closed.
with open('trainDistResult.pkl', 'rb') as trainDistInput:
    trainDistResult = pkl.load(trainDistInput)
In [10]:
# Keep the 100 pairs with the smallest sum-square-distance (column 1).
# DataFrame.sort was removed in pandas 0.20; sort_values is the replacement.
smallssd = trainDistResult.sort_values(by=1)[0:100]
print(smallssd)
In [9]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output
plt.figure(1)
i = 1
for a in smallssd.index[0:5]:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = trainData[stock1] / trainData[stock1][0]
    pairsprice2 = trainData[stock2] / trainData[stock2][0]
    plt.subplot(5, 1, i)
    # plot_date is deprecated (removed in Matplotlib >= 3.9); plain plot
    # handles datetime x-values the same way.
    plt.plot(pairsprice1.index, pairsprice1, 'r')
    plt.plot(pairsprice1.index, pairsprice2, 'b')
    plt.legend([stock1, stock2])
    plt.xlabel('Date')
    plt.ylabel('Standardized Price')
    plt.title('Time Series Plot of ' + stock1 + ' vs ' + stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90)  # Make the date labels on the x-axis vertical
plt.show()
In [12]:
# Time series plot of the gap between each pair's standardized prices
# (top-5 pairs by training sum-square-distance).
plt.figure(2)
for i, a in enumerate(smallssd.index[0:5], start=1):
    stock1, stock2 = tickerpairs[a]
    # Standardize each series by its first observation before differencing.
    pairsprice1 = trainData[stock1] / trainData[stock1][0]
    pairsprice2 = trainData[stock2] / trainData[stock2][0]
    pairsgap = pairsprice1 - pairsprice2
    plt.subplot(5, 1, i)
    plt.plot(pairsprice1.index, pairsgap, 'b')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Gap Price between ' + stock1 + ' vs ' + stock2)
plt.gcf().autofmt_xdate(rotation=90)  # rotate the x-axis date labels vertically
plt.show()
In [6]:
# Parallel processing for cointegration test.
# Note: this test can hang depending on system resources; if it does,
# restart the notebook and run it before the distance/correlation cells.
def cointegration_test(y, x):
    """Regress y on x via OLS (stat.OLS adds no intercept) and run an
    augmented Dickey-Fuller test on the residuals (Engle-Granger style)."""
    fitted = stat.OLS(y, x).fit()
    return ts.adfuller(fitted.resid)
def coint(cointTrainingTlist):
    """Cointegration worker for one ticker pair from ``trainData``.

    Parameters
    ----------
    cointTrainingTlist : tuple of (str, str)
        Ticker pair; both must be columns of the notebook-level ``trainData``.

    Returns
    -------
    tuple
        ``([(xname, yname)], pvalue)`` with the ADF p-value of the OLS
        residuals, or an empty tuple when there is too little data or the
        test fails.
    """
    try:
        cointTrainingXname, cointTrainingYname = cointTrainingTlist
        trainCointX = trainData[cointTrainingXname]
        trainCointY = trainData[cointTrainingYname]
        # Only pairs with most of the sample present are considered.
        if min(trainCointX.count(), trainCointY.count()) > 495:
            # Compute the joint not-null mask once instead of twice.
            both = trainCointX.notnull() & trainCointY.notnull()
            trainxp = list(trainCointX[both])
            trainyp = list(trainCointY[both])
            # Index 1 of the adfuller result is the p-value.
            return ([(cointTrainingXname, cointTrainingYname)],
                    cointegration_test(trainxp, trainyp)[1])
        else:
            return ()
    except (ValueError, TypeError):
        # OLS/adfuller can raise on degenerate series; treat as "no result".
        return ()
if __name__ == '__main__':
    trainCointPool = Pool(processes=4)
    # Test just the first 100 pairs - remove [0:100] for full test
    trainCointResult = pd.DataFrame(trainCointPool.map(coint, tickerpairs[0:100]))
    trainCointPool.close()
    trainCointPool.join()  # wait for workers to finish shutting down
In [11]:
# Unzip the precomputed cointegration results committed to the GitHub repo
# (IPython shell escape; requires gunzip on the PATH).
!gunzip -c trainCointResult.pkl.gz > trainCointResult.pkl
In [12]:
# Load the precomputed cointegration results from the pickle file;
# the context manager guarantees the handle is closed.
with open('trainCointResult.pkl', 'rb') as trainCointInput:
    trainCointResult = pkl.load(trainCointInput)
In [14]:
# Keep the 100 pairs with the smallest ADF p-values (column 1).
# DataFrame.sort was removed in pandas 0.20; use sort_values instead.
smallCoint = trainCointResult.sort_values(by=1)[0:100]
print(smallCoint.head())
In [23]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output
plt.figure(3)
i = 1
for a in smallCoint.index[0:5]:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = trainData[stock1] / trainData[stock1][0]
    pairsprice2 = trainData[stock2] / trainData[stock2][0]
    plt.subplot(5, 1, i)
    # plot_date is deprecated (removed in Matplotlib >= 3.9); plain plot
    # handles datetime x-values the same way.
    plt.plot(pairsprice1.index, pairsprice1, 'r')
    plt.plot(pairsprice1.index, pairsprice2, 'b')
    plt.legend([stock1, stock2])
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.title(stock1 + ' vs ' + stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90)  # Make the date labels on the x-axis vertical
plt.show()
In [7]:
# Time series plot of the standardized price gap for the top-5
# cointegrated pairs (training sample).
plt.figure(4)
for i, a in enumerate(smallCoint.index[0:5], start=1):
    stock1, stock2 = tickerpairs[a]
    # Standardize each series by its first observation before differencing.
    pairsprice1 = trainData[stock1] / trainData[stock1][0]
    pairsprice2 = trainData[stock2] / trainData[stock2][0]
    pairsgap = pairsprice1 - pairsprice2
    plt.subplot(5, 1, i)
    plt.plot(pairsprice1.index, pairsgap, 'b')
    plt.xlabel('Date')
    plt.ylabel('Gap Price')
    plt.title(stock1 + ' vs ' + stock2)
plt.gcf().autofmt_xdate(rotation=90)  # rotate the x-axis date labels vertically
plt.show()
In [58]:
# Parallel processing for correlation test
# Consider updating to take advantage of pandas.DataFrame.corr() but would
# need to change handling of output
def correlate(tlist):
    """Pearson-correlation worker for one ticker pair in ``trainData``.

    Parameters
    ----------
    tlist : tuple of (str, str)
        Ticker pair; both must be columns of the notebook-level ``trainData``.

    Returns
    -------
    tuple
        ``([(xname, yname)], corr)``, or an empty tuple when too little
        data is present or the computation fails.
    """
    try:
        xname, yname = tlist
        x = trainData[xname]
        y = trainData[yname]
        # Require most of the sample before trusting the correlation.
        if min(x.count(), y.count()) > 490:
            return ([(xname, yname)], x.corr(y))
        else:
            return ()
    except (ValueError, TypeError):
        return ()
if __name__ == '__main__':
    trainCorrPool = Pool(processes=4)
    # NOTE: unlike the distance/cointegration cells, this maps over ALL
    # pairs -- slice tickerpairs (e.g. [0:100]) for a quick test run.
    trainCorrResult = pd.DataFrame(trainCorrPool.map(correlate, tickerpairs))
    trainCorrPool.close()
    trainCorrPool.join()  # wait for workers to finish shutting down
In [15]:
# Unzip the precomputed correlation results committed to the GitHub repo
# (IPython shell escape; requires gunzip on the PATH).
!gunzip -c trainCorrResult.pkl.gz > trainCorrResult.pkl
In [16]:
# Read the precomputed correlation results and show the five most
# correlated pairs. DataFrame.sort was removed in pandas 0.20; use
# sort_values instead.
with open('trainCorrResult.pkl', 'rb') as trainCorrInput:
    trainCorrResult = pkl.load(trainCorrInput)
pairscorrelated = trainCorrResult.sort_values(by=1, ascending=False)[0:5]
print(pairscorrelated)
In [17]:
# Name the columns, drop pairs with no result, and keep the 100 most
# correlated pairs. DataFrame.sort was removed in pandas 0.20; use
# sort_values instead.
trainCorrResult.columns = ['pair', 'corr']
trainCorrTop = trainCorrResult.dropna().sort_values(by='corr', ascending=False)[0:100]
trainCorrTop.head()
Out[17]:
In [30]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output
plt.figure(5)
i = 1
for a in trainCorrTop.index[0:5]:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = trainData[stock1] / trainData[stock1][0]
    pairsprice2 = trainData[stock2] / trainData[stock2][0]
    plt.subplot(5, 1, i)
    # plot_date is deprecated (removed in Matplotlib >= 3.9); plain plot
    # handles datetime x-values the same way.
    plt.plot(pairsprice1.index, pairsprice1, 'r')
    plt.plot(pairsprice1.index, pairsprice2, 'b')
    plt.legend([stock1, stock2])
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.title(stock1 + ' vs ' + stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90)  # Make the date labels on the x-axis vertical
plt.show()
In [10]:
# Time series plot of the standardized price gap for the top-5
# most-correlated pairs (training sample).
plt.figure(6)
for i, a in enumerate(trainCorrTop.index[0:5], start=1):
    stock1, stock2 = tickerpairs[a]
    # Standardize each series by its first observation before differencing.
    pairsprice1 = trainData[stock1] / trainData[stock1][0]
    pairsprice2 = trainData[stock2] / trainData[stock2][0]
    pairsgap = pairsprice1 - pairsprice2
    plt.subplot(5, 1, i)
    plt.plot(pairsprice1.index, pairsgap, 'b')
    plt.xlabel('Date')
    plt.ylabel('Gap Price')
    plt.title(stock1 + ' vs ' + stock2)
plt.gcf().autofmt_xdate(rotation=90)  # rotate the x-axis date labels vertically
plt.show()
In [18]:
# Parallel processing for mean sum-square-distance on the TEST sample.
# NOTE(review): this redefines dist() from the training cells -- here the
# argument is an index into tickerpairs rather than the pair tuple itself.
def dist(tlist):
    """Sum-square-distance of standardized test prices for ``tickerpairs[tlist]``.

    Returns ``([(xname, yname)], ssd)`` or an empty tuple when too few
    overlapping observations exist in ``testData``.
    """
    xname = tickerpairs[tlist][0]
    yname = tickerpairs[tlist][1]
    x = testData[xname] / testData[xname][0]
    y = testData[yname] / testData[yname][0]
    # Remove any NA values
    z = (x - y).dropna()
    # Only consider pairs with most of the (shorter) test sample available.
    if len(z) > 248:
        # Vectorized square-and-sum; the original sum(map(lambda z: z**2, z))
        # shadowed z inside its own lambda.
        return ([(xname, yname)], (z ** 2).sum())
    else:
        return ()
if __name__ == '__main__':
    testDistPool = Pool(processes=4)
    # Tests just the number of pairs selected in smallssd
    testDistResult = pd.DataFrame(testDistPool.map(dist, smallssd.index))
    testDistPool.close()
    testDistPool.join()  # wait for workers to finish shutting down
In [19]:
# Training SSD vs. test SSD for the selected pairs, with a reference
# cutoff line drawn at x = .3.
fig, ax = plt.subplots()
ax.scatter(smallssd[1], testDistResult[1])
ax.vlines(.3, -2, 15, colors='r')
ax.text(.31, 11, 'x = .3')
ax.set(xlabel='Training SSD', ylabel='Test SSD',
       title='Consistency of SSD Method')
plt.show()
In [47]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output
plt.figure(1)
i = 1
for a in smallssd.index[0:5]:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = testData[stock1] / testData[stock1][0]
    pairsprice2 = testData[stock2] / testData[stock2][0]
    plt.subplot(5, 1, i)
    # plot_date is deprecated (removed in Matplotlib >= 3.9); plain plot
    # handles datetime x-values the same way.
    plt.plot(pairsprice1.index, pairsprice1, 'r')
    plt.plot(pairsprice1.index, pairsprice2, 'b')
    plt.legend([stock1, stock2])
    plt.xlabel('Date')
    plt.ylabel('Standardized Price')
    plt.title('Time Series Plot of ' + stock1 + ' vs ' + stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90)  # Make the date labels on the x-axis vertical
plt.show()
In [48]:
# Time series plot of the standardized price gap on the TEST sample
# for the pairs selected by training SSD.
plt.figure(2)
for i, a in enumerate(smallssd.index[0:5], start=1):
    stock1, stock2 = tickerpairs[a]
    # Standardize each series by its first observation before differencing.
    pairsprice1 = testData[stock1] / testData[stock1][0]
    pairsprice2 = testData[stock2] / testData[stock2][0]
    pairsgap = pairsprice1 - pairsprice2
    plt.subplot(5, 1, i)
    plt.plot(pairsprice1.index, pairsgap, 'b')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Gap Price between ' + stock1 + ' vs ' + stock2)
plt.gcf().autofmt_xdate(rotation=90)  # rotate the x-axis date labels vertically
plt.show()
In [9]:
# Parallel processing for cointegration test:
# NOTE(review): redefines cointegration_test from the training section with
# identical logic -- OLS of y on x (no intercept added), then an augmented
# Dickey-Fuller test on the residuals.
def cointegration_test(y, x):
    result = stat.OLS(y, x).fit()
    return(ts.adfuller(result.resid))
def coint(cointTestingTlist):
    """Cointegration worker for the test sample.

    Parameters
    ----------
    cointTestingTlist : int
        Index into the notebook-level ``tickerpairs`` list.

    Returns
    -------
    tuple
        ``([(xname, yname)], pvalue)`` with the ADF p-value of the OLS
        residuals on ``testData``, or an empty tuple when too little data
        is present or the test fails.
    """
    try:
        cointTestingXname, cointTestingYname = tickerpairs[cointTestingTlist]
        testCointX = testData[cointTestingXname]
        testCointY = testData[cointTestingYname]
        # Require most of the (shorter) test sample.
        if min(testCointX.count(), testCointY.count()) > 248:
            # Compute the joint not-null mask once instead of twice.
            both = testCointX.notnull() & testCointY.notnull()
            testxp = list(testCointX[both])
            testyp = list(testCointY[both])
            # Index 1 of the adfuller result is the p-value.
            return ([(cointTestingXname, cointTestingYname)],
                    cointegration_test(testxp, testyp)[1])
        else:
            return ()
    except (ValueError, TypeError):
        # OLS/adfuller can raise on degenerate series; treat as "no result".
        return ()
if __name__ == '__main__':
    testCointPool = Pool(processes=4)
    # Re-test only the pairs selected on the training sample (smallCoint).
    testCointResult = pd.DataFrame(testCointPool.map(coint, smallCoint.index))
    testCointPool.close()
    testCointPool.join()  # wait for workers to finish shutting down
# DataFrame.sort was removed in pandas 0.20; use sort_values instead.
testsmallCoint = testCointResult.sort_values(by=1)[0:5]
print(testsmallCoint)
In [49]:
# Training vs. test cointegration p-values for the five selected pairs.
fig, ax = plt.subplots()
ax.scatter(smallCoint[1][0:5], testCointResult[1][0:5])
ax.set_xlim(-1e-25, 1e-24)
ax.set(xlabel='Training Cointegration', ylabel='Test Cointegration',
       title='Consistency of Cointegration Method')
plt.show()
In [46]:
# Note the scale of the training p-values (minute -- see boxplot below).
smallCoint.head()
Out[46]:
In [31]:
# Same pairs, test-period p-values: 3 of 5 are NOT SIGNIFICANT.
testCointResult.head()
Out[31]:
In [25]:
# Concatenate the training and test cointegration results for comparison.
df = pd.DataFrame(smallCoint.reset_index())
df['2'] = testCointResult[1]
# BUG FIX: dropna returns a new frame and the original discarded the
# result, so rows with missing values were never actually removed.
df = df.dropna(axis=0, how='any')
df.columns = ['index', 'pair', 'train', 'test']
df.head()
Out[25]:
In [30]:
# The training boxplot is invisible at this scale -- the training
# p-values are so minute relative to the test p-values.
bp = df.boxplot(column=['train', 'test'])
plt.show()
In [19]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output
plt.figure(1)
i = 1
for a in smallCoint.index[0:5]:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = testData[stock1] / testData[stock1][0]
    pairsprice2 = testData[stock2] / testData[stock2][0]
    plt.subplot(5, 1, i)
    # plot_date is deprecated (removed in Matplotlib >= 3.9); plain plot
    # handles datetime x-values the same way.
    plt.plot(pairsprice1.index, pairsprice1, 'r')
    plt.plot(pairsprice1.index, pairsprice2, 'b')
    plt.legend([stock1, stock2])
    plt.xlabel('Date')
    plt.ylabel('Standardized Price')
    plt.title('Time Series Plot of ' + stock1 + ' vs ' + stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90)  # Make the date labels on the x-axis vertical
plt.show()
In [11]:
# Time series plot of the standardized price gap on the TEST sample
# for the pairs selected by training cointegration.
plt.figure(2)
for i, a in enumerate(smallCoint.index[0:5], start=1):
    stock1, stock2 = tickerpairs[a]
    # Standardize each series by its first observation before differencing.
    pairsprice1 = testData[stock1] / testData[stock1][0]
    pairsprice2 = testData[stock2] / testData[stock2][0]
    pairsgap = pairsprice1 - pairsprice2
    plt.subplot(5, 1, i)
    plt.plot(pairsprice1.index, pairsgap, 'b')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Gap Price between ' + stock1 + ' vs ' + stock2)
plt.gcf().autofmt_xdate(rotation=90)  # rotate the x-axis date labels vertically
plt.show()
In [30]:
# Parallel processing for correlation test on the TEST sample.
# NOTE(review): redefines correlate(); here the argument is an index into
# tickerpairs rather than the pair tuple itself.
# Consider updating to take advantage of pandas.DataFrame.corr() but would
# need to change handling of output
def correlate(tlist):
    """Pearson correlation of the test-period prices for ``tickerpairs[tlist]``.

    Returns ``([(xname, yname)], corr)`` or an empty tuple when too little
    data is present or the computation fails.
    """
    try:
        xname, yname = tickerpairs[tlist]
        x = testData[xname]
        y = testData[yname]
        # Require most of the (shorter) test sample before trusting the value.
        if min(x.count(), y.count()) > 240:
            return ([(xname, yname)], x.corr(y))
        else:
            return ()
    except (ValueError, TypeError):
        return ()
if __name__ == '__main__':
    testCorrPool = Pool(processes=4)
    # Re-test only the top pairs selected on the training sample.
    testCorrResult = pd.DataFrame(testCorrPool.map(correlate, trainCorrTop.index))
    testCorrPool.close()
    testCorrPool.join()  # wait for workers to finish shutting down
In [38]:
# Training vs. test correlation for the selected pairs, with the
# x = .993 cutoff marked.
testCorrResult.columns = ['pair', 'corr']
fig, ax = plt.subplots()
ax.scatter(trainCorrTop['corr'], testCorrResult['corr'])
ax.vlines(.993, -1, 1, colors='r')
ax.text(.994, 0, 'x = .993')
ax.set(xlabel='Training Correlation', ylabel='Test Correlation',
       title='Consistency of Correlation Method')
plt.show()
In [16]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output
plt.figure(1)
i = 1
for a in trainCorrTop.index[0:5]:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = testData[stock1] / testData[stock1][0]
    pairsprice2 = testData[stock2] / testData[stock2][0]
    plt.subplot(5, 1, i)
    # plot_date is deprecated (removed in Matplotlib >= 3.9); plain plot
    # handles datetime x-values the same way.
    plt.plot(pairsprice1.index, pairsprice1, 'r')
    plt.plot(pairsprice1.index, pairsprice2, 'b')
    plt.legend([stock1, stock2])
    plt.xlabel('Date')
    plt.ylabel('Standardized Price')
    plt.title('Time Series Plot of ' + stock1 + ' vs ' + stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90)  # Make the date labels on the x-axis vertical
plt.show()
In [17]:
# Time series plot of the standardized price gap on the TEST sample
# for the pairs selected by training correlation.
plt.figure(2)
for i, a in enumerate(trainCorrTop.index[0:5], start=1):
    stock1, stock2 = tickerpairs[a]
    # Standardize each series by its first observation before differencing.
    pairsprice1 = testData[stock1] / testData[stock1][0]
    pairsprice2 = testData[stock2] / testData[stock2][0]
    pairsgap = pairsprice1 - pairsprice2
    plt.subplot(5, 1, i)
    plt.plot(pairsprice1.index, pairsgap, 'b')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Gap Price between ' + stock1 + ' vs ' + stock2)
plt.gcf().autofmt_xdate(rotation=90)  # rotate the x-axis date labels vertically
plt.show()
In [ ]: