Stat222 Capstone Project - Pairs Trading

Import required libraries


In [1]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts
from itertools import combinations
import multiprocessing
from multiprocessing import Pool

Using manually downloaded WRDS (Wharton) data


In [3]:
# Unzip and import sorted stock data - currently using a test file to manage load times
# (the broken-pipe warning from the first gunzip is expected: head exits after reading the header line)
!gunzip -c testdata.csv.gz | head -n1 > testsort.csv
!gunzip -c testdata.csv.gz | tail -n +2 | sort -t, -k2,2 >> testsort.csv
# In CRSP-style data, negative prices denote bid/ask averages; take absolute values to recover the price level
wrdsData = pd.read_csv("testsort.csv", sep=",", index_col=[0, 1], parse_dates=[0], usecols=[1, 2, 3]).abs()


gunzip: error writing to output: Broken pipe
gunzip: testdata.csv.gz: uncompress failed
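

The shell round-trip above can also be skipped. Here is a minimal sketch of reading the gzipped file directly into pandas and sorting there; this is an alternative, not the version used below, and it assumes this pandas version supports compression='gzip' and that the full frame fits in memory:


In [ ]:
# Sketch: read the gzipped CSV directly and sort inside pandas instead of shelling out
raw = pd.read_csv("testdata.csv.gz", compression='gzip', usecols=[1, 2, 3])
raw.columns = ['date', 'ticker', 'PRC']   # the original header names are discarded
raw['date'] = pd.to_datetime(raw['date'])
wrdsData = raw.set_index(['date', 'ticker']).sort_index().abs()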

In [4]:
# read_csv with usecols can mislabel the index levels, so set the names explicitly
wrdsData.index.names = ['date', 'ticker']
wrdsData.head()


Out[4]:
PRC
date ticker
2011-01-03 AEPI 26.510
JJSF 49.440
PLXS 31.770
RMCF 9.861
MSFT 27.980

5 rows × 1 columns


In [5]:
# Test selection of prices by ticker symbol
wrdsData.xs('MSFT', level='ticker').head(5)


Out[5]:
PRC
date
2011-01-03 27.9800
2011-01-04 28.0875
2011-01-05 28.0000
2011-01-06 28.8200
2011-01-07 28.6000

5 rows × 1 columns
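

The same cross-section selection works on the other index level: xs on 'date' returns every ticker's price for a single day. A minimal sketch (not executed here):


In [ ]:
# Select all tickers for one trading day by slicing the 'date' level of the MultiIndex
wrdsData.xs('2011-01-03', level='date').head()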


In [24]:
#Create training and testing data sets (left commented out for now; .ix is deprecated, so .loc is used instead)
#trainData = wrdsData.loc['20110101':'20121231']
#testData = wrdsData.loc['20130101':'20131231']
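
If the split is brought back, a hedged sketch of how it might look, assuming the 'date' level was parsed as datetimes and sorting the index first so that label slicing is safe:


In [ ]:
# Sketch of a date-based train/test split on the (date, ticker) MultiIndex
wrdsSorted = wrdsData.sort_index()
trainData = wrdsSorted.loc['2011-01-01':'2012-12-31']
testData = wrdsSorted.loc['2013-01-01':'2013-12-31']
print trainData.shape, testData.shape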

How many pairs are there?


In [6]:
# cut needs an explicit comma delimiter for CSV input; field 3 (1-based) is the ticker symbol
!gunzip -c testdata.csv.gz | tail -n +2 | cut -d, -f 3 > tickerlist.csv
tickers = pd.read_csv("tickerlist.csv", header=None)
tickerlist = sorted(set(tickers[0]))[1:]
tickerpairs = list(combinations(tickerlist, 2))
print tickerpairs[:10]  #look at the first 10 combinations of pairs
print len(tickerpairs)  #about 2.65 million possible pairs - too many to analyze exhaustively?


[('AAME', 'AAON'), ('AAME', 'AAPL'), ('AAME', 'AAWW'), ('AAME', 'AAXJ'), ('AAME', 'ABAX'), ('AAME', 'ABCB'), ('AAME', 'ABCD'), ('AAME', 'ABCO'), ('AAME', 'ABFS'), ('AAME', 'ABIO')]
2650753
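
The count is just "n choose 2", so it can be sanity-checked directly (assuming tickerlist and tickerpairs are still in scope):


In [ ]:
# Sanity check: the number of unordered pairs should equal n*(n-1)/2
n = len(tickerlist)
print n, n * (n - 1) // 2, len(tickerpairs)  # 2,650,753 pairs implies n = 2,303 tickers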

Distance Method: rescale each price series by its first observation, then rank pairs by the sum of squared differences between the rescaled series. The pairs with the smallest distance are the candidates.


In [7]:
# Parallel computation of the sum-of-squared-distance metric for each candidate pair
def dist(tlist):
    xname = tlist[0]
    yname = tlist[1]
    # Rescale each price series by its first observation before differencing
    x = wrdsData.xs(xname, level='ticker').PRC/(wrdsData.xs(xname, level='ticker').PRC[0])
    y = wrdsData.xs(yname, level='ticker').PRC/(wrdsData.xs(yname, level='ticker').PRC[0])
    return([(xname, yname)], sum(map(lambda z:z**2, x-y)))

if __name__ == '__main__':
    pool = Pool(processes=4)
    #Test just the first 100 pairs - remove [0:100] for full test
    result = pd.DataFrame(pool.map(dist, tickerpairs[0:100]))
    pool.close()

# Sort by distance (column 1) and keep the five closest pairs
smallssd = result.sort(columns = 1)[0:5]
print smallssd


                 0         1
23  [(AAME, ACNB)]  0.117484
31  [(AAME, ACWX)]  0.145515
30  [(AAME, ACWI)]  0.153148
84  [(AAME, ALOT)]  0.192313
40  [(AAME, ADRD)]  0.215263

[5 rows x 2 columns]
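
The lambda/map inside dist can be replaced with vectorized pandas arithmetic, which tends to be faster once the test moves beyond 100 pairs. A minimal sketch of the same metric (an alternative, not the version used above):


In [ ]:
# Sketch: equivalent SSD computation using vectorized pandas operations
def dist_vectorized(tlist):
    xname, yname = tlist
    x = wrdsData.xs(xname, level='ticker').PRC
    y = wrdsData.xs(yname, level='ticker').PRC
    # Rescale by the first observation, then sum the squared daily gaps
    return (xname, yname), (((x / x.iloc[0]) - (y / y.iloc[0])) ** 2).sum()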

In [7]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output

plt.figure(1)
i = 1
for a in smallssd.index:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC/(wrdsData.xs(stock1, level='ticker').PRC[0])
  pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC/(wrdsData.xs(stock2, level='ticker').PRC[0])
  plt.subplot(1,5,i)
  plt.plot_date(pairsprice1.index, pairsprice1,'r')
  plt.plot_date(pairsprice2.index, pairsprice2,'b')
  plt.legend([stock1,stock2])
  plt.xlabel('Date')
  plt.ylabel('Standardized Price')
  plt.title('Time Series Plot of '+stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)  #Make the data labels on the x-axis vertical
plt.show()

In [8]:
#Time series plot of the gap (spread) between standardized prices
plt.figure(2)
i = 1
for a in smallssd.index:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  #Updated to reflect standardized price differential
  pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC/(wrdsData.xs(stock1, level='ticker').PRC[0])
  pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC/(wrdsData.xs(stock2, level='ticker').PRC[0])
  pairsgap = pairsprice1-pairsprice2
  plt.subplot(1,5,i)
  plt.plot(pairsprice1.index, pairsgap,'b')
  plt.legend([stock1+' - '+stock2])
  plt.xlabel('Date')
  plt.ylabel('Standardized Price Gap')
  plt.title('Gap between '+stock1+' and '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)    #Make the data labels on the x-axis vertical
plt.show()

Co-Integration (ADF) Test: regress one price series on the other and run the augmented Dickey-Fuller test on the residuals (the Engle-Granger two-step approach). Small p-values flag candidate pairs.


In [8]:
# Parallel processing for the cointegration test:
def cointegration_test(y, x):
    # Engle-Granger step 1: OLS of y on x (no intercept); step 2: ADF test on the residuals
    result = stat.OLS(y, x).fit()
    return(ts.adfuller(result.resid))


def coint(tlist):
    try:
        xname = tlist[0]
        yname = tlist[1]
        x = wrdsData.xs(xname, level='ticker').PRC
        y = wrdsData.xs(yname, level='ticker').PRC
        xclean = x[x.notnull() & y.notnull()]
        yclean = y[x.notnull() & y.notnull()]
        xp = list(xclean)
        yp = list(yclean)
        return([(xname, yname)], cointegration_test(xp,yp)[1]) #keep the p-value of the ADF test for each pair
    except ValueError:
        return()  #skip pairs with degenerate or missing data
    except TypeError:
        return()


if __name__ == '__main__':
    pool = Pool(processes=4)
    #Test just the first 100 pairs - remove [0:100] for full test
    cointResult = pd.DataFrame(pool.map(coint, tickerpairs[0:100]))
    pool.close()

# Sort by p-value (column 1) and keep the five most significant pairs
smallCoint = cointResult.sort(columns = 1)[0:5]
print smallCoint


                 0         1
75  [(AAME, ALCO)]  0.001548
90  [(AAME, AMAG)]  0.001574
8   [(AAME, ABFS)]  0.001704
70  [(AAME, AIRT)]  0.001790
18  [(AAME, ACFN)]  0.004354

[5 rows x 2 columns]
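
statsmodels also ships a ready-made Engle-Granger cointegration test. A hedged sketch using ts.coint (availability depends on the statsmodels version; it includes a constant in the first-stage regression, so its p-values will only roughly agree with the no-intercept OLS above):


In [ ]:
# Sketch: built-in Engle-Granger test for one of the flagged pairs
x = wrdsData.xs('AAME', level='ticker').PRC
y = wrdsData.xs('ALCO', level='ticker').PRC
mask = x.notnull() & y.notnull()
tstat, pvalue, crit = ts.coint(x[mask], y[mask])
print pvalue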

Controlling the false discovery rate: Benjamini–Hochberg procedure


In [9]:
#Control the false discovery rate using the Benjamini-Hochberg procedure (from Wikipedia):
#Assume there are m hypothesis tests.
#Order the p-values in increasing order and call them P_(1),P_(2),...,P_(m)
#Then the steps for the procedure are as follows:
#1) For a given alpha, find the largest k such that P_(k) <= (k/m)*alpha
#2) Then reject all H_(i) for i = 1,2,...,k

from __future__ import division  #Python 2 integer division truncates, so force true division for (k/m)

Cointp = cointResult.sort(columns = 1)
cointpvalues = list(Cointp[Cointp.columns[1]])
m = len(cointpvalues)
alpha = 0.2  #False Discovery Rate (20% is used; can be tweaked if necessary)

#Walk up the sorted p-values until one exceeds its BH threshold.
#Note: this stops at the first crossing, which equals the BH k only when no later
#p-value drops back below the (k/m)*alpha line; the bound on k avoids running off the list.
k = 0
while k < m and cointpvalues[k] <= ((k+1)/m)*alpha:
    k += 1

print k
print Cointp[:k] #significant pairs under the BH procedure


17
                 0         1
75  [(AAME, ALCO)]  0.001548
90  [(AAME, AMAG)]  0.001574
8   [(AAME, ABFS)]  0.001704
70  [(AAME, AIRT)]  0.001790
18  [(AAME, ACFN)]  0.004354
34  [(AAME, ADBE)]  0.007413
39  [(AAME, ADRA)]  0.010417
1   [(AAME, AAPL)]  0.010599
80  [(AAME, ALLB)]  0.012542
49  [(AAME, AEIS)]  0.013229
6   [(AAME, ABCD)]  0.013351
41  [(AAME, ADRE)]  0.014833
25  [(AAME, ACPW)]  0.016124
46  [(AAME, ADVS)]  0.017627
5   [(AAME, ABCB)]  0.021127
68   [(AAME, AIQ)]  0.025048
14  [(AAME, ACAT)]  0.026387

[17 rows x 2 columns]
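
For a cross-check, statsmodels includes a canned Benjamini-Hochberg implementation. A minimal sketch (assuming a statsmodels version that ships the multitest module and that cointpvalues contains no missing values):


In [ ]:
# Sketch: Benjamini-Hochberg via statsmodels; reject marks the hypotheses kept at FDR = 20%
from statsmodels.stats.multitest import multipletests
reject, pvals_corrected, _, _ = multipletests(cointpvalues, alpha=0.2, method='fdr_bh')
print reject.sum()  #should match the manual k above when no later p-value re-crosses the threshold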

In [8]:
#Plot raw prices for the five pairs with the smallest cointegration p-values
plt.figure(3)
i = 1
for a in smallCoint.index:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
  pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
  plt.subplot(1,5,i)
  plt.plot_date(pairsprice1.index, pairsprice1,'r')
  plt.plot_date(pairsprice1.index, pairsprice2,'b')
  plt.legend([stock1,stock2])
  plt.xlabel('Date')
  plt.ylabel('Stock Price')
  plt.title(stock1+' vs '+stock2) 
  i += 1

plt.gcf().autofmt_xdate(rotation=90)   #Make the data labels on the x-axis vertical
plt.show()

In [9]:
#Plot the price gap (spread) for the same cointegrated pairs
plt.figure(4)
i = 1
for a in smallCoint.index:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
  pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
  pairsgap = pairsprice1-pairsprice2
  plt.subplot(1,5,i)
  plt.plot(pairsprice1.index, pairsgap,'b')
  plt.legend([stock1+' - '+stock2])
  plt.xlabel('Date')
  plt.ylabel('Price Gap')
  plt.title(stock1+' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)    #Make the data labels on the x-axis vertical
plt.show()

Correlation: rank pairs by the Pearson correlation of their raw price series.


In [10]:
# Pearson correlation of the raw price series for a ticker pair
def correlate(tlist):
    try:
        xname = tlist[0]
        yname = tlist[1]
        corrs = np.corrcoef(wrdsData.xs(xname,level='ticker').PRC,wrdsData.xs(yname, level='ticker').PRC)[0,1]
        return([(xname, yname)], corrs)
    except ValueError:
        return()
    except TypeError:
        return()


if __name__ == '__main__':
    pool = Pool(processes=4)
    #Test just the first 100 pairs - remove [0:100] for full test
    corrpairs = pd.DataFrame(pool.map(correlate, tickerpairs[0:100]))
    pool.close()

# Sort by correlation (column 1) in descending order; rows 1-5 of the sorted frame are kept
pairscorrelated = corrpairs.sort(columns = 1, ascending=False)[1:6]
print pairscorrelated


                 0         1
92  [(AAME, AMAT)]  0.575924
99  [(AAME, AMKR)]  0.545626
16  [(AAME, ACET)]  0.533371
72  [(AAME, AIXG)]  0.503373
95  [(AAME, AMCN)]  0.495158

[5 rows x 2 columns]
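
When every pair is needed, pandas can compute the full correlation matrix in one pass by pivoting prices into a dates-by-tickers table. A minimal sketch (memory permitting for the full ticker universe):


In [ ]:
# Sketch: pivot to a wide dates-by-tickers frame and let pandas compute all pairwise correlations
wide = wrdsData['PRC'].unstack('ticker')
corrmat = wide.corr()  #pairwise Pearson correlations; missing values are handled pairwise
print corrmat.loc['AAME', 'AMAT']  #should agree with the np.corrcoef value above, up to NaN handling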

In [11]:
#Plot raw prices for the five most correlated pairs
plt.figure(5)
i = 1
for a in pairscorrelated.index:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
  pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
  plt.subplot(1,5,i)
  plt.plot_date(pairsprice1.index, pairsprice1,'r')
  plt.plot_date(pairsprice1.index, pairsprice2,'b')
  plt.legend([stock1,stock2])
  plt.xlabel('Date')
  plt.ylabel('Stock Price')
  plt.title(stock1+' vs '+stock2) 
  i += 1

plt.gcf().autofmt_xdate(rotation=90)   #Make the data labels on the x-axis vertical
plt.show()

In [12]:
#Plot the price gap (spread) for the five most correlated pairs
plt.figure(6)
i = 1
for a in pairscorrelated.index:
  stock1 = tickerpairs[a][0]
  stock2 = tickerpairs[a][1]
  pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
  pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
  pairsgap = pairsprice1-pairsprice2
  plt.subplot(1,5,i)
  plt.plot(pairsprice1.index, pairsgap,'b')
  plt.legend([stock1+' - '+stock2])
  plt.xlabel('Date')
  plt.ylabel('Price Gap')
  plt.title(stock1 +' vs '+stock2)
  i += 1

plt.gcf().autofmt_xdate(rotation=90)    #Make the data labels on the x-axis vertical
plt.show()

In [10]:
#Histogram of the Correlation of the stock pairs

correls = list(corrpairs[corrpairs.columns[1]])
categories = ['< 0.0','0.0-0.1','0.1-0.2','0.2-0.3','0.3-0.4','0.4-0.5','0.5-0.6','0.6-0.7','0.7-0.8','0.8-0.9','0.9-1.0']

# Count correlations in each bin: one catch-all bin below 0.0, then 0.1-wide bins from 0.0 up to 1.0
edges = [j / 10.0 for j in range(11)]   # 0.0, 0.1, ..., 1.0
frequencies = [len([cor for cor in correls if cor < 0.0])]
frequencies += [len([cor for cor in correls if low <= cor < high])
                for low, high in zip(edges[:-1], edges[1:])]

cat = np.arange(len(categories))

width = 1.0     # gives histogram aspect to the bar diagram

ax = plt.axes()
ax.set_xticks(cat + (width / 2))
ax.set_xticklabels(categories)

plt.bar(cat,frequencies,width)
plt.xlabel('Correlation Ranges')
plt.ylabel('Frequencies')
plt.title('Histogram of Correlation of Stock Pairs') 
plt.show()
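
The manual binning above can also be handed off to matplotlib. A minimal sketch using plt.hist with explicit bin edges (note the bins start at -1 rather than lumping all negative correlations together, and it assumes correls has no missing values):


In [ ]:
# Sketch: let matplotlib bin the correlations directly
plt.hist(correls, bins=np.arange(-1.0, 1.05, 0.1))
plt.xlabel('Correlation')
plt.ylabel('Frequency')
plt.title('Histogram of Correlation of Stock Pairs')
plt.show()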
