Stat222 Capstone Project - Pairs Trading

This notebook, data files, and plots can be accessed on GitHub at: https://github.com/jeffreydarling/222project

Import required libraries


In [2]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts
from itertools import combinations
import multiprocessing
from multiprocessing import Pool
import cPickle as pkl

Using manually downloaded WRDS (Wharton Research Data Services) data

# Run first time only to extract data from wrds download file and create training and test data sets
# Change cell from raw text to code to run

# Unzip and import sorted stock data - currently using test file to manage load times
!gunzip -c wrdsDownload.csv.gz | head -n1 > wrdsSorted.csv
!gunzip -c wrdsDownload.csv.gz | tail -n +2 | sort -t, -k2,2 >> wrdsSorted.csv

# Use absolute values to clean up incorrect negative values
wrdsData = pd.io.parsers.read_csv("wrdsSorted.csv", sep = ",", index_col=[0,1], parse_dates=0, usecols = [1,2,3]).abs()

# Known issue with usecols is wrong index names
wrdsData.index.names = ['date', 'ticker']
wrdsData.head()

# Sort and remove duplicate indices
wrdsDirty = wrdsData.reset_index().groupby(wrdsData.index.names).first()
wrdsClean = wrdsDirty.reset_index().pivot('date','ticker','PRC')

# Create training and testing data sets
trainData = wrdsClean.ix['20110103':'20121231'].dropna(axis=1, how='all')
testData = wrdsClean.ix['20130102':'20131231']
# Run first time only to output training data set in pickle format
# Change cell from raw text to code to run
trainOutput = open('trainData.pkl', 'wb')
pkl.dump(trainData, trainOutput, -1)
trainOutput.close()
# Run first time only to output test data set in pickle format
# Change cell from raw text to code to run
testOutput = open('testData.pkl', 'wb')
pkl.dump(testData, testOutput, -1)
testOutput.close()

In [3]:
trainInput = open('trainData.pkl', 'rb')
trainData = pkl.load(trainInput)
trainInput.close()

In [4]:
testInput = open('testData.pkl', 'rb')
testData = pkl.load(testInput)
testInput.close()

How many pairs are there?


In [5]:
# Extracts column headers from training data set and creates all possible pairs
tickerlist = sorted(trainData.columns.tolist())[1:]
tickerpairs = list(combinations(tickerlist,2))
print tickerpairs[:10] # Look at first 10 combinations of pairs
print len(tickerpairs) # Over 3 million possible pairs


[('AAME', 'AAON'), ('AAME', 'AAPL'), ('AAME', 'AAWW'), ('AAME', 'AAXJ'), ('AAME', 'ABAX'), ('AAME', 'ABCB'), ('AAME', 'ABCD'), ('AAME', 'ABCO'), ('AAME', 'ABFS'), ('AAME', 'ABIO')]
3232153
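With n tickers there are n(n-1)/2 unordered pairs, and solving n(n-1)/2 = 3,232,153 gives n = 2543 tickers. A quick sanity check of that count, as a sketch against the tickerlist and tickerpairs defined above:

# Sanity check: n tickers yield n*(n-1)/2 unordered pairs
n = len(tickerlist)  # 2543 tickers in the training data
assert len(tickerpairs) == n * (n - 1) // 2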

Training Data Set

Distance Method


In [6]:
# Parallel processing for sum-of-squared-distances (SSD) between normalized price series
def dist(tlist):
    xname = tlist[0]
    yname = tlist[1]
    x = trainData[xname]/(trainData[xname][0])
    y = trainData[yname]/(trainData[yname][0])
    # Remove any na values
    z = (x-y).dropna()
    # Only consider pairs with most of the data present
    if len(z) > 495:
        return([(xname, yname)], sum(map(lambda z:z**2, z)))
    else:
        return()

if __name__ == '__main__':
    trainDistPool = Pool(processes=4)
    # Test just the first 100 pairs - remove [0:100] for full test - warning, takes a long time!!
    trainDistResult = pd.DataFrame(trainDistPool.map(dist, tickerpairs[0:100]))

trainDistPool.close()
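For reference, the body of dist can be written with vectorized pandas operations instead of map and a lambda that shadows z; a minimal equivalent sketch:

# Equivalent SSD computation using vectorized pandas operations
def ssd(xname, yname):
    # Normalize each series by its first price so both start at 1
    x = trainData[xname] / trainData[xname][0]
    y = trainData[yname] / trainData[yname][0]
    # Difference the two series, dropping days where either price is missing
    z = (x - y).dropna()
    # Sum of squared distances; None for pairs with too few overlapping days
    return (z ** 2).sum() if len(z) > 495 else None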
# Save results to pkl file - only run once when analyzing full data set
# Change cell from raw text to code to run
trainDistOutput = open('trainDistResult.pkl', 'wb')
pkl.dump(trainDistResult, trainDistOutput, -1)
trainDistOutput.close()

smallssd = trainDistResult.sort(columns = 1)[0:5]
print smallssd
# Save a copy for github - only run once when analyzing full data set
# Change cell from raw text to code to run
!gzip -k trainDistResult.pkl

In [8]:
# Unzip github file
!gunzip -c trainDistResult.pkl.gz > trainDistResult.pkl

In [9]:
# Read results from pkl file
trainDistInput = open('trainDistResult.pkl', 'rb')
trainDistResult = pkl.load(trainDistInput)
trainDistInput.close()

In [10]:
smallssd = trainDistResult.sort(columns = 1)[0:100]
print smallssd


                        0         1
3221878    [(VONE, VTHR)]  0.013202
2958608    [(PRFZ, VTWV)]  0.078365
3217295    [(VCSH, VMBS)]  0.112965
2459008  [(LBTYA, LBTYK)]  0.125308
3225367    [(VTWG, VTWO)]  0.129157
3217279    [(VCSH, VGSH)]  0.144788
2958607    [(PRFZ, VTWO)]  0.153353
84471      [(ACWX, IFSM)]  0.164394
2832838    [(ONEQ, VONE)]  0.165975
2832839    [(ONEQ, VONG)]  0.169638
3222161    [(VONV, VTHR)]  0.184708
108404     [(ADRD, ADRU)]  0.191009
2832860    [(ONEQ, VTHR)]  0.214025
3216758    [(VCIT, VGIT)]  0.230499
2483677  [(LINTA, LINTB)]  0.235668
2687682    [(MSEX, YORW)]  0.239340
3225483    [(VTWO, VTWV)]  0.256559
10988      [(AAXJ, FCHI)]  0.258966
3012557    [(QQXT, VONG)]  0.260118
3221858    [(VONE, VONV)]  0.264960
2254103    [(IFGL, VNQI)]  0.267773
3221857    [(VONE, VONG)]  0.274490
1787334     [(FDV, QQEW)]  0.292310
1787886     [(FDV, VTWG)]  0.319899
573801     [(BANF, QABA)]  0.324042
105904     [(ADRA, ADRE)]  0.335971
958375    [(CENT, CENTA)]  0.336306
81264       [(ACWI, CHW)]  0.339927
3009253    [(QQEW, VTHR)]  0.348418
552149     [(AXFN, GRID)]  0.352711
1276784     [(CSQ, VTWO)]  0.354989
3218140    [(VGSH, VMBS)]  0.358162
10199      [(AAXJ, ADRA)]  0.375360
552302     [(AXFN, IFSM)]  0.377309
3222020    [(VONG, VTHR)]  0.378349
84246      [(ACWX, FUND)]  0.380259
3222166    [(VONV, VTWG)]  0.389194
83367      [(ACWX, ADRD)]  0.392960
1276758     [(CSQ, VONV)]  0.392992
1322003    [(CUBA, VTWV)]  0.397307
3009231    [(QQEW, VONE)]  0.399292
1042134     [(CHW, PRFZ)]  0.403097
1276783     [(CSQ, VTWG)]  0.411294
2858139    [(OTTR, TCRD)]  0.413924
2627678     [(MGF, YORW)]  0.416530
110052     [(ADRD, PAGG)]  0.421563
3009233    [(QQEW, VONV)]  0.425668
1787861     [(FDV, VONV)]  0.430986
2945343    [(PNQI, QQXT)]  0.431276
1797179    [(FFBC, NBTB)]  0.436099
3091438  [(SENEA, SENEB)]  0.440247
3011915    [(QQQX, VTHR)]  0.440720
2627530     [(MGF, VCSH)]  0.445798
2687539    [(MSEX, VGSH)]  0.450024
3011895    [(QQQX, VONV)]  0.451862
82617      [(ACWI, PRFZ)]  0.455715
1782730     [(FDI, VGIT)]  0.458783
2033379    [(GRID, IFSM)]  0.459515
2627535     [(MGF, VGSH)]  0.463582
553624     [(AXFN, WOOD)]  0.477062
                      ...       ...

[100 rows x 2 columns]

In [9]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output

plt.figure(1)
i = 1
for a in smallssd.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = trainData[stock1]/(trainData[stock1][0])
  pairsprice2 = trainData[stock2]/(trainData[stock2][0])
  plt.subplot(5,1,i)
  plt.plot_date(pairsprice1.index, pairsprice1,'r')
  plt.plot_date(pairsprice1.index, pairsprice2,'b')
  plt.legend([stock1,stock2])
  plt.xlabel('Date')
  plt.ylabel('Standardized Price')
  plt.title('Time Series Plot of '+stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)  #Make the data labels on the x-axis vertical
plt.show()

In [12]:
# Time series plot of average gap in standardized price

plt.figure(2)
i = 1
for a in smallssd.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = trainData[stock1]/(trainData[stock1][0])
  pairsprice2 = trainData[stock2]/(trainData[stock2][0])
  pairsgap = pairsprice1-pairsprice2
  plt.subplot(5,1,i)
  plt.plot(pairsprice1.index, pairsgap,'b')
  plt.xlabel('Date')
  plt.ylabel('Price')
  plt.title('Gap Price between '+stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)    #Make the data labels on the x-axis vertical
plt.show()

Cointegration (ADF) Test


In [6]:
# Parallel processing for cointegration test:
# Note - this test will sometimes hang depending on system resources
# If it does, restart notebook and run this test prior to distance or correlation

def cointegration_test(y, x):
    # OLS of y on x (no intercept), then ADF test on the residuals
    ctresult = stat.OLS(y, x).fit()
    return(ts.adfuller(ctresult.resid))


def coint(cointTrainingTlist):
    try:
        cointTrainingXname = cointTrainingTlist[0]
        cointTrainingYname = cointTrainingTlist[1]
        trainCointX = trainData[cointTrainingXname]
        trainCointY = trainData[cointTrainingYname]
        if min(trainCointX.count(), trainCointY.count()) > 495:
            trainxclean = trainCointX[trainCointX.notnull() & trainCointY.notnull()]
            trainyclean = trainCointY[trainCointX.notnull() & trainCointY.notnull()]
            trainxp = list(trainxclean)
            trainyp = list(trainyclean)
            return([(cointTrainingXname, cointTrainingYname)], cointegration_test(trainxp,trainyp)[1]) # Get the p-value of test for each pair
        else:
            return()
    except ValueError:
        return()
    except TypeError:
        return()


if __name__ == '__main__':
    trainCointPool = Pool(processes=4)
    # Test just the first 100 pairs - remove [0:100] for full test
    trainCointResult = pd.DataFrame(trainCointPool.map(coint, tickerpairs[0:100]))

trainCointPool.close()


                 0         1
48  [(AAME, ADVS)]  0.000122
61  [(AAME, AFOP)]  0.000218
16  [(AAME, ACET)]  0.009350
4   [(AAME, ABAX)]  0.014371
22  [(AAME, ACIW)]  0.017331

[5 rows x 2 columns]
# Run only when analyzing full data set to output cointegration results in pickle format
# Change cell from raw text to code to run
trainCointOutput = open('trainCointResult.pkl', 'wb')
pkl.dump(trainCointResult, trainCointOutput, -1)
trainCointOutput.close()
# Save a copy for github - only run once when analyzing full data set
# Change cell from raw text to code to run
!gzip -k trainCointResult.pkl
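The two-step test above is the Engle-Granger procedure: regress one price series on the other by OLS, then run an ADF test on the residuals, since stationary residuals suggest the pair is cointegrated. statsmodels also bundles both steps as statsmodels.tsa.stattools.coint; a minimal sketch of the same per-pair test, assuming the installed statsmodels ships ts.coint (note it includes a constant in the regression, unlike the OLS above, so p-values will differ slightly):

# Engle-Granger cointegration via the statsmodels built-in
def coint_builtin(pair):
    # Keep only the days where both tickers have prices
    both = trainData[list(pair)].dropna()
    # ts.coint returns (t-statistic, p-value, critical values)
    tstat, pvalue, crit = ts.coint(both[pair[0]], both[pair[1]])
    return pair, pvalue

print coint_builtin(('AAME', 'ADVS'))  # pair taken from the output above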

In [11]:
# Unzip github file
!gunzip -c trainCointResult.pkl.gz > trainCointResult.pkl

In [12]:
# Load results from pkl file
trainCointInput = open('trainCointResult.pkl', 'rb')
trainCointResult = pkl.load(trainCointInput)
trainCointInput.close()

In [14]:
smallCoint = trainCointResult.sort(columns = 1)[0:100]
print smallCoint.head()


                        0             1
2876609    [(PATR, RDIB)]  0.000000e+00
2483677  [(LINTA, LINTB)]  0.000000e+00
124437     [(ADVS, HUBG)]  6.877388e-29
3091438  [(SENEA, SENEB)]  1.191406e-25
2876488    [(PATR, PNBK)]  1.387992e-25

[5 rows x 2 columns]

Controlling the false discovery rate: Benjamini–Hochberg procedure

# NOTE: Still too many pairs, so we arbitrarily examine the top 100.
# Not in use, but to run change cell from raw text to code

# Control for false discovery rate using the Benjamini-Hochberg procedure (from Wikipedia):
# Assume there are m hypothesis tests.
# Order the p-values in increasing order and call them P_(1), P_(2), ..., P_(m)
# Then the steps for the procedure are as follows:
# 1) For a given alpha, find the largest k such that P_(k) <= (k/m)*alpha
# 2) Then reject all H_(i) for i = 1, 2, ..., k

from __future__ import division  # needed because Python 2 floors integer division by default
Cointp = trainCointResult.sort(columns = 1)
cointpvalues = list(Cointp[Cointp.columns[1]])
m = len(cointpvalues)
alpha = 0.01  # False discovery rate (1% is used here; can tweak this if necessary)
k = 0
while cointpvalues[k] <= ((k+1)/m)*alpha:  # Obtain the k from step 1)
    k += 1
print k
CointSaved = Cointp[:k]
print CointSaved.head()  # Significant pairs under BH procedure
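Note that the step-up definition calls for the largest k satisfying p_(k) <= (k/m)*alpha, while the while loop stops at the first k that fails it; the two only agree when the comparisons never dip back under the line. A vectorized sketch that takes the largest k directly (numpy is already imported as np):

def bh_cutoff(pvalues, alpha=0.01):
    # Benjamini-Hochberg: largest k with p_(k) <= (k/m) * alpha
    p = np.sort(np.asarray(pvalues, dtype=float))
    m = len(p)
    below = p <= alpha * np.arange(1, m + 1, dtype=float) / m
    # Indices are 0-based, so the largest passing index plus 1 is k
    return below.nonzero()[0].max() + 1 if below.any() else 0

Calling bh_cutoff(cointpvalues) would return the number of pairs kept at a 1% false discovery rate.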

In [23]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output

plt.figure(3)
i = 1
for a in smallCoint.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = trainData[stock1]/(trainData[stock1][0])
  pairsprice2 = trainData[stock2]/(trainData[stock2][0])
  plt.subplot(5,1,i)
  plt.plot_date(pairsprice1.index, pairsprice1,'r')
  plt.plot_date(pairsprice1.index, pairsprice2,'b')
  plt.legend([stock1,stock2])
  plt.xlabel('Date')
  plt.ylabel('Stock Price')
  plt.title(stock1+' vs '+stock2) 
  i += 1

plt.gcf().autofmt_xdate(rotation=90)   # Make the data labels on the x-axis vertical
plt.show()

In [7]:
# Time series plot of standardized gap in stock price

plt.figure(4)
i = 1
for a in smallCoint.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = trainData[stock1]/(trainData[stock1][0])
  pairsprice2 = trainData[stock2]/(trainData[stock2][0])
  pairsgap = pairsprice1-pairsprice2
  plt.subplot(5,1,i)
  plt.plot(pairsprice1.index, pairsgap,'b')
  plt.xlabel('Date')
  plt.ylabel('Gap Price')
  plt.title(stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)    # Make the data labels on the x-axis vertical
plt.show()

Correlation


In [58]:
# Parallel processing for correlation test
# Consider updating to take advantage of pandas.DataFrame.corr() but would need to change handling of output

def correlate(tlist):
    try:
        xname = tlist[0]
        yname = tlist[1]
        x = trainData[xname]
        y = trainData[yname]
        if min(x.count(), y.count()) > 490:
            corrs = x.corr(y)
            return([(xname, yname)], corrs)
        else:
            return()
    except ValueError:
        return()
    except TypeError:
        return()


if __name__ == '__main__':
    trainCorrPool = Pool(processes=4)
    # Runs the full set of pairs - add [0:100] to tickerpairs below to test a subset
    trainCorrResult = pd.DataFrame(trainCorrPool.map(correlate, tickerpairs))
    
trainCorrPool.close()
# Run only when analyzing full data set to output correlation results in pickle format
# Change cell from raw text to code to run
trainCorrOutput = open('trainCorrResult.pkl', 'wb')
pkl.dump(trainCorrResult, trainCorrOutput, -1)
trainCorrOutput.close()

pairscorrelated = trainCorrResult.sort(columns = 1, ascending=False)[0:5]
print pairscorrelated
# Save a copy for github - only run once when analyzing full data set
# Change cell from raw text to code to run
!gzip -k trainCorrResult.pkl
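As the comment in the correlation cell suggests, pandas can compute every pairwise correlation at once with DataFrame.corr, which uses pairwise-complete observations by default. A minimal sketch on a column subset (the full matrix would be roughly 2543 x 2543, so memory grows with the square of the column count; Series.order matches the pandas vintage used here, renamed sort_values in later releases):

# All pairwise Pearson correlations for the first 200 tickers at once
subset = trainData[tickerlist[:200]]
corrmatrix = subset.corr()

# Stack to a (ticker, ticker) -> correlation series and keep each pair once
corrpairs = corrmatrix.stack()
keep = np.array([a < b for a, b in corrpairs.index])
print corrpairs[keep].order(ascending=False)[:5]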

In [15]:
# Unzip github file
!gunzip -c trainCorrResult.pkl.gz > trainCorrResult.pkl

In [16]:
#Read results from pkl file
trainCorrInput = open('trainCorrResult.pkl', 'rb')
trainCorrResult = pkl.load(trainCorrInput)
trainCorrInput.close()


pairscorrelated = trainCorrResult.sort(columns = 1, ascending=False)[0:5]
print pairscorrelated


            0   1
1616076  None NaN
1847483  None NaN
1847418  None NaN
1847421  None NaN
1847431  None NaN

[5 rows x 2 columns]

In [17]:
trainCorrResult.columns = ['pair','corr']
trainCorrTop = trainCorrResult.dropna().sort(columns='corr', ascending=False)[0:100]
trainCorrTop.head()


Out[17]:
pair corr
3221878 [(VONE, VTHR)] 0.998255
2459008 [(LBTYA, LBTYK)] 0.998026
3178525 [(TECUA, TECUB)] 0.996770
1101073 [(CMCSA, CMCSK)] 0.995945
3052453 [(ROIA, ROIAK)] 0.995695

5 rows × 2 columns


In [30]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output

plt.figure(5)
i = 1
for a in trainCorrTop.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = trainData[stock1]/(trainData[stock1][0])
  pairsprice2 = trainData[stock2]/(trainData[stock2][0])
  plt.subplot(5,1,i)
  plt.plot_date(pairsprice1.index, pairsprice1,'r')
  plt.plot_date(pairsprice1.index, pairsprice2,'b')
  plt.legend([stock1,stock2])
  plt.xlabel('Date')
  plt.ylabel('Stock Price')
  plt.title(stock1+' vs '+stock2) 
  i += 1

plt.gcf().autofmt_xdate(rotation=90)   # Make the data labels on the x-axis vertical
plt.show()

In [10]:
# Time series plot of standardized gap in stock prices

plt.figure(6)
i = 1
for a in trainCorrTop.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = trainData[stock1]/(trainData[stock1][0])
  pairsprice2 = trainData[stock2]/(trainData[stock2][0])
  pairsgap = pairsprice1-pairsprice2
  plt.subplot(5,1,i)
  plt.plot(pairsprice1.index, pairsgap,'b')
  plt.xlabel('Date')
  plt.ylabel('Gap Price')
  plt.title(stock1 +' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)    # Make the data labels on the x-axis vertical
plt.show()
# Histogram of the Correlation of the stock pairs
# Not currently in use
# Change cell from raw text to code to run
correls = list(trainCorrResult[trainCorrResult.columns[1]])
categories = ['< 0.0','0.0-0.1','0.1-0.2','0.2-0.3','0.3-0.4','0.4-0.5','0.5-0.6','0.6-0.7','0.7-0.8','0.8-0.9','0.9-1.0']
freq0 = len([cor for cor in correls if cor < 0.0])
freq1 = len([cor for cor in correls if 0.0 <= cor < 0.1])
freq2 = len([cor for cor in correls if 0.1 <= cor < 0.2])
freq3 = len([cor for cor in correls if 0.2 <= cor < 0.3])
freq4 = len([cor for cor in correls if 0.3 <= cor < 0.4])
freq5 = len([cor for cor in correls if 0.4 <= cor < 0.5])
freq6 = len([cor for cor in correls if 0.5 <= cor < 0.6])
freq7 = len([cor for cor in correls if 0.6 <= cor < 0.7])
freq8 = len([cor for cor in correls if 0.7 <= cor < 0.8])
freq9 = len([cor for cor in correls if 0.8 <= cor < 0.9])
freq10 = len([cor for cor in correls if 0.9 <= cor < 1.0])
cat = np.arange(len(categories))
frequencies = [freq0,freq1,freq2,freq3,freq4,freq5,freq6,freq7,freq8,freq9,freq10]
width = 1.0  # gives histogram aspect to the bar diagram
ax = plt.axes()
ax.set_xticks(cat + (width / 2))
ax.set_xticklabels(categories)
plt.bar(cat,frequencies,width)
plt.xlabel('Correlation Ranges')
plt.ylabel('Frequencies')
plt.title('Histogram of Correlation of Stock Pairs')
plt.show()
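The eleven list comprehensions above can be collapsed with numpy.histogram; a minimal sketch producing the same frequencies, assuming categories from the cell above (correlations lie in [-1, 1], so -1.0 serves as the lower edge; numpy's last bin also counts an exact 1.0, unlike the strict < 1.0 above):

# One bin for negative correlations, then ten 0.1-wide bins
correls = trainCorrResult[trainCorrResult.columns[1]].dropna()
edges = np.concatenate(([-1.0], np.linspace(0.0, 1.0, 11)))
frequencies, _ = np.histogram(correls, bins=edges)
print zip(categories, frequencies)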

Test Data Set

Distance Method


In [18]:
# Parallel processing for sum-of-squared-distances (SSD) on the test period
# Note: here dist takes an index into tickerpairs rather than a ticker pair

def dist(tlist):
    xname = tickerpairs[tlist][0]
    yname = tickerpairs[tlist][1]
    x = testData[xname]/(testData[xname][0])
    y = testData[yname]/(testData[yname][0])
    # Remove any NA values
    z = (x-y).dropna()
    # Only consider pairs with most of data available
    if len(z) > 248:
        return([(xname, yname)], sum(map(lambda z:z**2, z)))
    else:
        return()

if __name__ == '__main__':
    testDistPool = Pool(processes=4)
    # Tests just the pairs selected in smallssd
    testDistResult = pd.DataFrame(testDistPool.map(dist, smallssd.index))

testDistPool.close()

In [19]:
plt.scatter(smallssd[1], testDistResult[1])
plt.vlines(.3,-2,15, colors='r')
plt.text(.31,11,'x = .3')
plt.xlabel('Training SSD')
plt.ylabel('Test SSD')
plt.title('Consistency of SSD Method') 
plt.show()

In [47]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output

plt.figure(1)
i = 1
for a in smallssd.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = testData[stock1]/(testData[stock1][0])
  pairsprice2 = testData[stock2]/(testData[stock2][0])
  plt.subplot(5,1,i)
  plt.plot_date(pairsprice1.index, pairsprice1,'r')
  plt.plot_date(pairsprice1.index, pairsprice2,'b')
  plt.legend([stock1,stock2])
  plt.xlabel('Date')
  plt.ylabel('Standardized Price')
  plt.title('Time Series Plot of '+stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)  # Make the data labels on the x-axis vertical
plt.show()

In [48]:
#Time series plot of average gap in standardized price

plt.figure(2)
i = 1
for a in smallssd.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  #Updated to reflect standardized price differential
  pairsprice1 = testData[stock1]/(testData[stock1][0])
  pairsprice2 = testData[stock2]/(testData[stock2][0])
  pairsgap = pairsprice1-pairsprice2
  plt.subplot(5,1,i)
  plt.plot(pairsprice1.index, pairsgap,'b')
  plt.xlabel('Date')
  plt.ylabel('Price')
  plt.title('Gap Price between '+stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)    # Make the data labels on the x-axis vertical
plt.show()

Cointegration


In [9]:
# Parallel processing for cointegration test:

def cointegration_test(y, x):
    # OLS of y on x (no intercept), then ADF test on the residuals
    result = stat.OLS(y, x).fit()
    return(ts.adfuller(result.resid))


def coint(cointTestingTlist):
    try:
        cointTestingXname = tickerpairs[cointTestingTlist][0]
        cointTestingYname = tickerpairs[cointTestingTlist][1]
        testCointX = testData[cointTestingXname]
        testCointY = testData[cointTestingYname]
        if min(testCointX.count(), testCointY.count()) > 248:
            testxclean = testCointX[testCointX.notnull() & testCointY.notnull()]
            testyclean = testCointY[testCointX.notnull() & testCointY.notnull()]
            testxp = list(testxclean)
            testyp = list(testyclean)
            return([(cointTestingXname, cointTestingYname)], cointegration_test(testxp,testyp)[1]) # Get the p-value of test for each pair
        else:
            return()
    except ValueError:
        return()
    except TypeError:
        return()


if __name__ == '__main__':
    testCointPool = Pool(processes=4)
    # Tests just the pairs selected in smallCoint
    testCointResult = pd.DataFrame(testCointPool.map(coint, smallCoint.index))

testCointPool.close()
testsmallCoint = testCointResult.sort(columns = 1)[0:5]
print testsmallCoint


                  0             1
1  [(LINTA, LINTB)]  1.231724e-26
3  [(SENEA, SENEB)]  1.257753e-08
0    [(PATR, RDIB)]  1.496377e-01
2    [(ADVS, HUBG)]  4.528192e-01
4    [(PATR, PNBK)]  9.694963e-01

[5 rows x 2 columns]

In [49]:
plt.scatter(smallCoint[1][0:5], testCointResult[1][0:5])
plt.xlim(-1e-25,1e-24)
plt.xlabel('Training Cointegration')
plt.ylabel('Test Cointegration')
plt.title('Consistency of Cointegration Method') 
plt.show()

In [46]:
# Note scale of training p-values
smallCoint.head()


Out[46]:
0 1
2876609 [(PATR, RDIB)] 0.000000e+00
2483677 [(LINTA, LINTB)] 0.000000e+00
124437 [(ADVS, HUBG)] 6.877388e-29
3091438 [(SENEA, SENEB)] 1.191406e-25
2876488 [(PATR, PNBK)] 1.387992e-25

5 rows × 2 columns


In [31]:
# Note the test p-values for the same pairs - 3 of 5 are NOT significant
testCointResult.head()


Out[31]:
0 1
0 [(PATR, RDIB)] 1.496377e-01
1 [(LINTA, LINTB)] 1.231724e-26
2 [(ADVS, HUBG)] 4.528192e-01
3 [(SENEA, SENEB)] 1.257753e-08
4 [(PATR, PNBK)] 9.694963e-01

5 rows × 2 columns


In [25]:
# Concatenate the cointegration results for further evaluation

df = pd.DataFrame(smallCoint.reset_index())
df['2'] = testCointResult[1]
# dropna returns a copy, so assign it back (the original discarded the result)
df = df.dropna(axis=0, how='any')
df.columns = ['index','pair','train','test']
df.head()


Out[25]:
index pair train test
0 2876609 [(PATR, RDIB)] 0.000000e+00 1.496377e-01
1 2483677 [(LINTA, LINTB)] 0.000000e+00 1.231724e-26
2 124437 [(ADVS, HUBG)] 6.877388e-29 4.528192e-01
3 3091438 [(SENEA, SENEB)] 1.191406e-25 1.257753e-08
4 2876488 [(PATR, PNBK)] 1.387992e-25 9.694963e-01

5 rows × 4 columns


In [30]:
# Note that the training box plot barely appears due to scale issues - the training p-values are minuscule

bp = df.boxplot(column=['train','test'])
plt.show()
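One way around the scale problem is to compare -log10 p-values instead; a minimal sketch using df from the concatenation cell above (clipping guards the exact zeros in the training column):

# Boxplot of -log10(p); larger values mean stronger evidence of cointegration
logp = -np.log10(df[['train', 'test']].clip(lower=1e-30))
logp.boxplot()
plt.ylabel('-log10(p-value)')
plt.show()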

In [19]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output

plt.figure(1)
i = 1
for a in smallCoint.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = testData[stock1]/(testData[stock1][0])
  pairsprice2 = testData[stock2]/(testData[stock2][0])
  plt.subplot(5,1,i)
  plt.plot_date(pairsprice1.index, pairsprice1,'r')
  plt.plot_date(pairsprice1.index, pairsprice2,'b')
  plt.legend([stock1,stock2])
  plt.xlabel('Date')
  plt.ylabel('Standardized Price')
  plt.title('Time Series Plot of '+stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)  # Make the data labels on the x-axis vertical
plt.show()

In [11]:
# Time series plot of average gap in standardized price

plt.figure(2)
i = 1
for a in smallCoint.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  #Updated to reflect standardized price differential
  pairsprice1 = testData[stock1]/(testData[stock1][0])
  pairsprice2 = testData[stock2]/(testData[stock2][0])
  pairsgap = pairsprice1-pairsprice2
  plt.subplot(5,1,i)
  plt.plot(pairsprice1.index, pairsgap,'b')
  plt.xlabel('Date')
  plt.ylabel('Price')
  plt.title('Gap Price between '+stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)    # Make the data labels on the x-axis vertical
plt.show()

Correlation


In [30]:
# Parallel processing for correlation test
# Consider updating to take advantage of pandas.DataFrame.corr() but would need to change handling of output

def correlate(tlist):
    try:
        xname = tickerpairs[tlist][0]
        yname = tickerpairs[tlist][1]
        x = testData[xname]
        y = testData[yname]
        if min(x.count(), y.count()) > 240:
            corrs = x.corr(y)
            return([(xname, yname)], corrs)
        else:
            return()
    except ValueError:
        return()
    except TypeError:
        return()


if __name__ == '__main__':
    testCorrPool = Pool(processes=4)
    # Tests just the pairs selected in trainCorrTop
    testCorrResult = pd.DataFrame(testCorrPool.map(correlate, trainCorrTop.index))
    
testCorrPool.close()

In [38]:
testCorrResult.columns = ['pair','corr']
plt.scatter(trainCorrTop['corr'], testCorrResult['corr'])
plt.vlines(.993,-1,1, colors='r')
plt.text(.994,0,'x = .993')
plt.xlabel('Training Correlation')
plt.ylabel('Test Correlation')
plt.title('Consistency of Correlation Method') 
plt.show()

In [16]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output

plt.figure(1)
i = 1
for a in trainCorrTop.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = testData[stock1]/(testData[stock1][0])
  pairsprice2 = testData[stock2]/(testData[stock2][0])
  plt.subplot(5,1,i)
  plt.plot_date(pairsprice1.index, pairsprice1,'r')
  plt.plot_date(pairsprice1.index, pairsprice2,'b')
  plt.legend([stock1,stock2])
  plt.xlabel('Date')
  plt.ylabel('Standardized Price')
  plt.title('Time Series Plot of '+stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)  # Make the data labels on the x-axis vertical
plt.show()

In [17]:
#Time series plot of average gap in standardized price

plt.figure(2)
i = 1
for a in trainCorrTop.index[0:5]:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  #Updated to reflect standardized price differential
  pairsprice1 = testData[stock1]/(testData[stock1][0])
  pairsprice2 = testData[stock2]/(testData[stock2][0])
  pairsgap = pairsprice1-pairsprice2
  plt.subplot(5,1,i)
  plt.plot(pairsprice1.index, pairsgap,'b')
  plt.xlabel('Date')
  plt.ylabel('Price')
  plt.title('Gap Price between '+stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)    # Make the data labels on the x-axis vertical
plt.show()
