In [1]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts
from itertools import combinations
import multiprocessing
from multiprocessing import Pool
In [3]:
# Unzip and import sorted stock data - currently using test file to manage load times
!gunzip -c testdata.csv.gz | head -n1 > testsort.csv
!gunzip -c testdata.csv.gz | tail -n +2 | sort -t, -k2,2 >> testsort.csv
# Use absolute values to clean up incorrect negative values
wrdsData = pd.io.parsers.read_csv("testsort.csv", sep = ",", index_col=[0,1], parse_dates=0, usecols = [1,2,3]).abs()
In [4]:
# usecols is known to mangle the MultiIndex level names, so restore them
# explicitly, then preview the first rows.
wrdsData.index = wrdsData.index.set_names(['date', 'ticker'])
wrdsData.head()
Out[4]:
In [5]:
# Sanity check: pull one ticker's price history out of the MultiIndex
# (head() defaults to the first 5 rows).
wrdsData.xs('MSFT', level='ticker').head()
Out[5]:
In [24]:
#Create training and testing data sets (currently disabled)
#Note: DataFrame.ix is deprecated/removed in later pandas; use .loc instead:
#trainData = wrdsData.loc['20110101':'20121231']
#testData = wrdsData.loc['20130101':'20131231']
In [6]:
# Extract the ticker column from the raw data (header row skipped).
# NOTE(review): cut's default delimiter is TAB, not comma, so on a CSV with
# no tabs each whole line passes through unchanged; read_csv below then
# re-splits on commas and column index 2 happens to be the ticker field.
# If that is the intent, `cut -d, -f3` would be the direct equivalent —
# confirm against the raw file before changing.
!gunzip -c testdata.csv.gz | tail -n +2 | cut -f 2 > tickerlist.csv
tickers = pd.io.parsers.read_csv("tickerlist.csv", sep = ",",header=None)
# Unique tickers, sorted; [1:] drops the first sorted entry — presumably a
# stray/blank value, verify against the data.
tickerlist = sorted(list(set(tickers[2])))[1:]
# Every unordered pair of distinct tickers.
tickerpairs = list(combinations(tickerlist,2))
print tickerpairs[:10] #look at first 10 combinations of pairs
print len(tickerpairs) #over 2.5 million possible pairs, too much data to analyze?
In [7]:
# Parallel processing for sum-square-distance
def dist(tlist):
    """Sum of squared differences between two tickers' standardized prices.

    tlist: (ticker_x, ticker_y) pair. Each price series is divided by its
    first observation so both start at 1.0 and are comparable.
    Returns ([(ticker_x, ticker_y)], ssd).
    """
    xname, yname = tlist
    # Fetch each series once instead of twice (the original performed four
    # MultiIndex .xs lookups per pair).
    xprc = wrdsData.xs(xname, level='ticker').PRC
    yprc = wrdsData.xs(yname, level='ticker').PRC
    x = xprc / xprc[0]
    y = yprc / yprc[0]
    # Builtin sum keeps the original semantics: any NaN propagates to NaN
    # (so incomplete pairs never rank as "close"); an empty overlap gives 0.
    return([(xname, yname)], sum(d * d for d in (x - y)))
# Rank all candidate pairs by SSD using 4 worker processes.
# NOTE(review): this relies on fork()-style multiprocessing so workers
# inherit wrdsData and dist from the notebook kernel — confirm it is not
# run where spawn is the start method (e.g. Windows).
if __name__ == '__main__':
    pool = Pool(processes=4)
    #Test just the first 100 pairs - remove [0:100] for full test
    result = pd.DataFrame(pool.map(dist, tickerpairs[0:100]))
    pool.close()
    # Column 1 of result holds the SSD values; keep the 5 closest pairs.
    # (DataFrame.sort(columns=...) is the pre-0.20 pandas API.)
    smallssd = result.sort(columns = 1)[0:5]
    print smallssd
In [7]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output
# One subplot per pair for the 5 smallest-SSD pairs; each series is scaled
# by its first observation so both lines start at 1.0.
plt.figure(1)
i = 1
for a in smallssd.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC/(wrdsData.xs(stock1, level='ticker').PRC[0])
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC/(wrdsData.xs(stock2, level='ticker').PRC[0])
    plt.subplot(1,5,i)
    plt.plot_date(pairsprice1.index, pairsprice1,'r')
    # NOTE(review): the second series is plotted against pairsprice1.index;
    # if the two tickers' date ranges ever differ this misaligns (or errors)
    # — confirm dates match, or use pairsprice2.index here.
    plt.plot_date(pairsprice1.index, pairsprice2,'b')
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Standardized Price')
    plt.title('Time Series Plot of '+stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [8]:
#Time series plot of average gap in standardized price
# One subplot per smallest-SSD pair showing the difference between the two
# standardized price series (the quantity the SSD ranking minimizes).
plt.figure(2)
i = 1
for a in smallssd.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    #Updated to reflect standardized price differential
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC/(wrdsData.xs(stock1, level='ticker').PRC[0])
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC/(wrdsData.xs(stock2, level='ticker').PRC[0])
    pairsgap = pairsprice1-pairsprice2
    plt.subplot(1,5,i)
    plt.plot(pairsprice1.index, pairsgap,'b')
    # NOTE(review): the legend lists two labels but only one line (the gap)
    # is plotted — consider a single label such as stock1+'-'+stock2.
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Gap Price between '+stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [8]:
# Parallel processing for cointegration test:
def cointegration_test(y, x):
    # Engle-Granger style two-step test: regress y on x by OLS, then run an
    # augmented Dickey-Fuller test on the residuals. Returns the full
    # adfuller result tuple (test statistic, p-value, ...).
    # NOTE(review): no constant is added to x (stat.add_constant), so the
    # regression is forced through the origin — the textbook Engle-Granger
    # first step includes an intercept; confirm this is intended.
    result = stat.OLS(y, x).fit()
    return(ts.adfuller(result.resid))
def coint(tlist):
    """Return ([(ticker_a, ticker_b)], p-value) of the cointegration test
    for one ticker pair, or () when the pair cannot be tested."""
    try:
        first = tlist[0]
        second = tlist[1]
        prc_a = wrdsData.xs(first, level='ticker').PRC
        prc_b = wrdsData.xs(second, level='ticker').PRC
        # Keep only the dates where both series are observed.
        both_present = prc_a.notnull() & prc_b.notnull()
        xp = list(prc_a[both_present])
        yp = list(prc_b[both_present])
        # Element [1] of the adfuller tuple is the p-value.
        return([(first, second)], cointegration_test(xp, yp)[1])
    except (ValueError, TypeError):
        # Untestable pairs (e.g. empty overlap) are skipped.
        return()
# Run the cointegration test across pairs in 4 worker processes.
if __name__ == '__main__':
    pool = Pool(processes=4)
    #Test just the first 100 pairs - remove [0:100] for full test
    # NOTE(review): coint() returns an empty tuple on failure; such rows
    # become missing values in the DataFrame — verify they cannot be
    # selected into smallCoint below.
    cointResult = pd.DataFrame(pool.map(coint, tickerpairs[0:100]))
    pool.close()
    # Column 1 holds the p-values; keep the 5 most significant pairs.
    # (DataFrame.sort(columns=...) is the pre-0.20 pandas API.)
    smallCoint = cointResult.sort(columns = 1)[0:5]
    print smallCoint
In [9]:
#Control for false discovery rate using Benjamani Hockberg procedure (From Wikipedia):
#Assume there are m hypothesis tests.
#Order the p-values in increasing order and call them P_(1),P_(2),....,P_(m)
#Then the steps for the procedure are as follows:
#1) For a given alpha, find the largest k such that P_(k) <= (k/m)*alpha
#2) Then reject all H_(i) for i = 1,2,...,k
from __future__ import division #need this bc python cannot do division for integers properly without it
Cointp = cointResult.sort(columns = 1)
cointpvalues = list(Cointp[Cointp.columns[1]])
m = len(cointpvalues)
alpha = 0.2 #False Discovery Rate (20% is used, Can tweak this if necessary)
k = 0
while cointpvalues[k] <= ((k+1)/m)*alpha: #Obtain the k from step 1)
k += 1
print k
print Cointp[:k] #significant pairs under BH procedure
In [8]:
#Plot raw prices for the 5 pairs with the smallest cointegration p-values
plt.figure(3)
i = 1
for a in smallCoint.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
    plt.subplot(1,5,i)
    plt.plot_date(pairsprice1.index, pairsprice1,'r')
    # NOTE(review): second series plotted against pairsprice1.index —
    # confirm both tickers share the same dates.
    plt.plot_date(pairsprice1.index, pairsprice2,'b')
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.title(stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [9]:
# Raw price gap (difference of prices) for the 5 most significant
# cointegrated pairs.
plt.figure(4)
i = 1
for a in smallCoint.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
    pairsgap = pairsprice1-pairsprice2
    plt.subplot(1,5,i)
    plt.plot(pairsprice1.index, pairsgap,'b')
    # NOTE(review): the legend lists two labels but only the gap line is
    # plotted.
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Gap Price')
    plt.title(stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [10]:
def correlate(tlist):
    """Pearson correlation of raw prices for one ticker pair.

    tlist: (ticker_x, ticker_y). Returns ([(ticker_x, ticker_y)], corr),
    or () for pairs that cannot be correlated.

    Dates where either series is missing are dropped first (same cleaning
    as coint()); previously np.corrcoef silently returned NaN for any pair
    with missing data, so those pairs vanished from the correlation ranking.
    """
    try:
        xname = tlist[0]
        yname = tlist[1]
        x = wrdsData.xs(xname, level='ticker').PRC
        y = wrdsData.xs(yname, level='ticker').PRC
        # Keep only the dates where both series are observed.
        mask = x.notnull() & y.notnull()
        corrs = np.corrcoef(list(x[mask]), list(y[mask]))[0,1]
        return([(xname, yname)], corrs)
    except (ValueError, TypeError):
        return()
# Compute pairwise correlations in 4 worker processes.
if __name__ == '__main__':
    pool = Pool(processes=4)
    #Test just the first 100 pairs - remove [0:100] for full test
    corrpairs = pd.DataFrame(pool.map(correlate, tickerpairs[0:100]))
    pool.close()
    # Highest correlations first; [1:6] keeps ranks 2-6, skipping the top
    # pair. NOTE(review): confirm skipping rank 1 is intentional — pairs are
    # distinct tickers, so it is not a self-correlation.
    pairscorrelated = corrpairs.sort(columns = 1, ascending=False)[1:6]
    print pairscorrelated
In [11]:
# Raw price series for the pairs selected in pairscorrelated.
plt.figure(5)
i = 1
for a in pairscorrelated.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
    plt.subplot(1,5,i)
    plt.plot_date(pairsprice1.index, pairsprice1,'r')
    # NOTE(review): second series plotted against pairsprice1.index —
    # confirm both tickers share the same dates.
    plt.plot_date(pairsprice1.index, pairsprice2,'b')
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.title(stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [12]:
# Raw price gap for the pairs selected in pairscorrelated.
plt.figure(6)
i = 1
for a in pairscorrelated.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
    pairsgap = pairsprice1-pairsprice2
    plt.subplot(1,5,i)
    plt.plot(pairsprice1.index, pairsgap,'b')
    # NOTE(review): the legend lists two labels but only the gap line is
    # plotted.
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Gap Price')
    plt.title(stock1 +' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [10]:
#Histogram of the Correlation of the stock pairs
correls = list(corrpairs[corrpairs.columns[1]])
categories = ['< 0.0','0.0-0.1','0.1-0.2','0.2-0.3','0.3-0.4','0.4-0.5','0.5-0.6','0.6-0.7','0.7-0.8','0.8-0.9','0.9-1.0']
# Bucket the correlations: one open bucket below 0.0, then ten half-open
# [lo, hi) buckets of width 0.1. This replaces eleven copy-pasted freqN
# lines with identical semantics: NaN correlations fail every comparison
# and are counted nowhere (as before), and so is an exact 1.0.
edges = [j / 10.0 for j in range(11)]  # 0.0, 0.1, ..., 1.0
frequencies = [len([cor for cor in correls if cor < 0.0])]
frequencies += [len([cor for cor in correls if lo <= cor < hi])
                for lo, hi in zip(edges[:-1], edges[1:])]
cat = np.arange(len(categories))
width = 1.0 # gives histogram aspect to the bar diagram
ax = plt.axes()
# Center the category labels under the unit-width bars.
ax.set_xticks(cat + (width / 2))
ax.set_xticklabels(categories)
plt.bar(cat,frequencies,width)
plt.xlabel('Correlation Ranges')
plt.ylabel('Frequencies')
plt.title('Histogram of Correlation of Stock Pairs')
plt.show()
In [10]:
In [ ]: