In [1]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts
from itertools import combinations
import multiprocessing
from multiprocessing import Pool
In [3]:
# Unzip and import sorted stock data - currently using test file to manage load times
!gunzip -c testdata.csv.gz | head -n1 > testsort.csv
!gunzip -c testdata.csv.gz | tail -n +2 | sort -t, -k2,2 >> testsort.csv
# Use absolute values to clean up incorrect negative values
wrdsData = pd.io.parsers.read_csv("testsort.csv", sep = ",", index_col=[0,1], parse_dates=0, usecols = [1,2,3]).abs()
In [4]:
# usecols is known to mangle the MultiIndex level names, so restore them
# explicitly, then preview the first rows.
wrdsData.index = wrdsData.index.set_names(['date', 'ticker'])
wrdsData.head()
Out[4]:
In [5]:
# Sanity check: pull one ticker's price history out of the MultiIndex
# (head() defaults to the first 5 rows).
wrdsData.xs('MSFT', level='ticker').head()
Out[5]:
In [24]:
#Create training and testing data sets (currently disabled)
#Note: DataFrame.ix is deprecated/removed in later pandas; use .loc instead:
#trainData = wrdsData.loc['20110101':'20121231']
#testData = wrdsData.loc['20130101':'20131231']
In [6]:
# Extract the ticker column from the raw data (header row skipped).
# NOTE(review): cut's default delimiter is TAB, not comma, so on a CSV with
# no tabs each whole line passes through unchanged; read_csv below then
# re-splits on commas and column index 2 happens to be the ticker field.
# If that is the intent, `cut -d, -f3` would be the direct equivalent —
# confirm against the raw file before changing.
!gunzip -c testdata.csv.gz | tail -n +2 | cut -f 2 > tickerlist.csv
tickers = pd.io.parsers.read_csv("tickerlist.csv", sep = ",",header=None)
# Unique tickers, sorted; [1:] drops the first sorted entry — presumably a
# stray/blank value, verify against the data.
tickerlist = sorted(list(set(tickers[2])))[1:]
# Every unordered pair of distinct tickers.
tickerpairs = list(combinations(tickerlist,2))
print tickerpairs[:10] #look at first 10 combinations of pairs
print len(tickerpairs) #over 2.5 million possible pairs, too much data to analyze?
In [7]:
# Parallel processing for sum-square-distance
def dist(tlist):
    """Sum of squared differences between two tickers' standardized prices.

    tlist: (ticker_x, ticker_y) pair. Each price series is divided by its
    first observation so both start at 1.0 and are comparable.
    Returns ([(ticker_x, ticker_y)], ssd).
    """
    xname, yname = tlist
    # Fetch each series once instead of twice (the original performed four
    # MultiIndex .xs lookups per pair).
    xprc = wrdsData.xs(xname, level='ticker').PRC
    yprc = wrdsData.xs(yname, level='ticker').PRC
    x = xprc / xprc[0]
    y = yprc / yprc[0]
    # Builtin sum keeps the original semantics: any NaN propagates to NaN
    # (so incomplete pairs never rank as "close"); an empty overlap gives 0.
    return([(xname, yname)], sum(d * d for d in (x - y)))
# Rank all candidate pairs by SSD using 4 worker processes.
# NOTE(review): this relies on fork()-style multiprocessing so workers
# inherit wrdsData and dist from the notebook kernel — confirm it is not
# run where spawn is the start method (e.g. Windows).
if __name__ == '__main__':
    pool = Pool(processes=4)
    #Test just the first 100 pairs - remove [0:100] for full test
    result = pd.DataFrame(pool.map(dist, tickerpairs[0:100]))
    pool.close()
    # Column 1 of result holds the SSD values; keep the 5 closest pairs.
    # (DataFrame.sort(columns=...) is the pre-0.20 pandas API.)
    smallssd = result.sort(columns = 1)[0:5]
    print smallssd
In [7]:
# Time series plot of standardized stock prices
# Updated to use parallel processing output
# One subplot per pair for the 5 smallest-SSD pairs; each series is scaled
# by its first observation so both lines start at 1.0.
plt.figure(1)
i = 1
for a in smallssd.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC/(wrdsData.xs(stock1, level='ticker').PRC[0])
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC/(wrdsData.xs(stock2, level='ticker').PRC[0])
    plt.subplot(1,5,i)
    plt.plot_date(pairsprice1.index, pairsprice1,'r')
    # NOTE(review): the second series is plotted against pairsprice1.index;
    # if the two tickers' date ranges ever differ this misaligns (or errors)
    # — confirm dates match, or use pairsprice2.index here.
    plt.plot_date(pairsprice1.index, pairsprice2,'b')
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Standardized Price')
    plt.title('Time Series Plot of '+stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [8]:
#Time series plot of average gap in standardized price
# One subplot per smallest-SSD pair showing the difference between the two
# standardized price series (the quantity the SSD ranking minimizes).
plt.figure(2)
i = 1
for a in smallssd.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    #Updated to reflect standardized price differential
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC/(wrdsData.xs(stock1, level='ticker').PRC[0])
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC/(wrdsData.xs(stock2, level='ticker').PRC[0])
    pairsgap = pairsprice1-pairsprice2
    plt.subplot(1,5,i)
    plt.plot(pairsprice1.index, pairsgap,'b')
    # NOTE(review): the legend lists two labels but only one line (the gap)
    # is plotted — consider a single label such as stock1+'-'+stock2.
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Gap Price between '+stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [8]:
# Parallel processing for cointegration test:
def cointegration_test(y, x):
    # Engle-Granger style two-step test: regress y on x by OLS, then run an
    # augmented Dickey-Fuller test on the residuals. Returns the full
    # adfuller result tuple (test statistic, p-value, ...).
    # NOTE(review): no constant is added to x (stat.add_constant), so the
    # regression is forced through the origin — the textbook Engle-Granger
    # first step includes an intercept; confirm this is intended.
    result = stat.OLS(y, x).fit()
    return(ts.adfuller(result.resid))
def coint(tlist):
    """Return ([(ticker_a, ticker_b)], p-value) of the cointegration test
    for one ticker pair, or () when the pair cannot be tested."""
    try:
        first = tlist[0]
        second = tlist[1]
        prc_a = wrdsData.xs(first, level='ticker').PRC
        prc_b = wrdsData.xs(second, level='ticker').PRC
        # Keep only the dates where both series are observed.
        both_present = prc_a.notnull() & prc_b.notnull()
        xp = list(prc_a[both_present])
        yp = list(prc_b[both_present])
        # Element [1] of the adfuller tuple is the p-value.
        return([(first, second)], cointegration_test(xp, yp)[1])
    except (ValueError, TypeError):
        # Untestable pairs (e.g. empty overlap) are skipped.
        return()
# Run the cointegration test across pairs in 4 worker processes.
if __name__ == '__main__':
    pool = Pool(processes=4)
    #Test just the first 100 pairs - remove [0:100] for full test
    # NOTE(review): coint() returns an empty tuple on failure; such rows
    # become missing values in the DataFrame — verify they cannot be
    # selected into smallCoint below.
    cointResult = pd.DataFrame(pool.map(coint, tickerpairs[0:100]))
    pool.close()
    # Column 1 holds the p-values; keep the 5 most significant pairs.
    # (DataFrame.sort(columns=...) is the pre-0.20 pandas API.)
    smallCoint = cointResult.sort(columns = 1)[0:5]
    print smallCoint
In [9]:
#Control for false discovery rate using Benjamani Hockberg procedure (From Wikipedia):
#Assume there are m hypothesis tests.
#Order the p-values in increasing order and call them P_(1),P_(2),....,P_(m)
#Then the steps for the procedure are as follows:
#1) For a given alpha, find the largest k such that P_(k) <= (k/m)*alpha
#2) Then reject all H_(i) for i = 1,2,...,k
from __future__ import division #need this bc python cannot do division for integers properly without it
Cointp = cointResult.sort(columns = 1)
cointpvalues = list(Cointp[Cointp.columns[1]])
m = len(cointpvalues)
alpha = 0.2 #False Discovery Rate (20% is used, Can tweak this if necessary)
k = 0
while cointpvalues[k] <= ((k+1)/m)*alpha: #Obtain the k from step 1)
k += 1
print k
print Cointp[:k] #significant pairs under BH procedure
In [8]:
#Plot raw prices for the 5 pairs with the smallest cointegration p-values
plt.figure(3)
i = 1
for a in smallCoint.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
    plt.subplot(1,5,i)
    plt.plot_date(pairsprice1.index, pairsprice1,'r')
    # NOTE(review): second series plotted against pairsprice1.index —
    # confirm both tickers share the same dates.
    plt.plot_date(pairsprice1.index, pairsprice2,'b')
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.title(stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [9]:
# Raw price gap (difference of prices) for the 5 most significant
# cointegrated pairs.
plt.figure(4)
i = 1
for a in smallCoint.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
    pairsgap = pairsprice1-pairsprice2
    plt.subplot(1,5,i)
    plt.plot(pairsprice1.index, pairsgap,'b')
    # NOTE(review): the legend lists two labels but only the gap line is
    # plotted.
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Gap Price')
    plt.title(stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [10]:
def correlate(tlist):
    """Pearson correlation of raw prices for one ticker pair.

    tlist: (ticker_x, ticker_y). Returns ([(ticker_x, ticker_y)], corr),
    or () for pairs that cannot be correlated.

    Dates where either series is missing are dropped first (same cleaning
    as coint()); previously np.corrcoef silently returned NaN for any pair
    with missing data, so those pairs vanished from the correlation ranking.
    """
    try:
        xname = tlist[0]
        yname = tlist[1]
        x = wrdsData.xs(xname, level='ticker').PRC
        y = wrdsData.xs(yname, level='ticker').PRC
        # Keep only the dates where both series are observed.
        mask = x.notnull() & y.notnull()
        corrs = np.corrcoef(list(x[mask]), list(y[mask]))[0,1]
        return([(xname, yname)], corrs)
    except (ValueError, TypeError):
        return()
# Compute pairwise correlations in 4 worker processes.
if __name__ == '__main__':
    pool = Pool(processes=4)
    #Test just the first 100 pairs - remove [0:100] for full test
    corrpairs = pd.DataFrame(pool.map(correlate, tickerpairs[0:100]))
    pool.close()
    # Highest correlations first; [1:6] keeps ranks 2-6, skipping the top
    # pair. NOTE(review): confirm skipping rank 1 is intentional — pairs are
    # distinct tickers, so it is not a self-correlation.
    pairscorrelated = corrpairs.sort(columns = 1, ascending=False)[1:6]
    print pairscorrelated
In [11]:
# Raw price series for the pairs selected in pairscorrelated.
plt.figure(5)
i = 1
for a in pairscorrelated.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
    plt.subplot(1,5,i)
    plt.plot_date(pairsprice1.index, pairsprice1,'r')
    # NOTE(review): second series plotted against pairsprice1.index —
    # confirm both tickers share the same dates.
    plt.plot_date(pairsprice1.index, pairsprice2,'b')
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.title(stock1+' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [12]:
# Raw price gap for the pairs selected in pairscorrelated.
plt.figure(6)
i = 1
for a in pairscorrelated.index:
    stock1 = tickerpairs[a][0]
    stock2 = tickerpairs[a][1]
    pairsprice1 = wrdsData.xs(stock1, level='ticker').PRC
    pairsprice2 = wrdsData.xs(stock2, level='ticker').PRC
    pairsgap = pairsprice1-pairsprice2
    plt.subplot(1,5,i)
    plt.plot(pairsprice1.index, pairsgap,'b')
    # NOTE(review): the legend lists two labels but only the gap line is
    # plotted.
    plt.legend([stock1,stock2])
    plt.xlabel('Date')
    plt.ylabel('Gap Price')
    plt.title(stock1 +' vs '+stock2)
    i += 1
plt.gcf().autofmt_xdate(rotation=90) #Make the data labels on the x-axis vertical
plt.show()
In [10]:
#Histogram of the Correlation of the stock pairs
correls = list(corrpairs[corrpairs.columns[1]])
categories = ['< 0.0','0.0-0.1','0.1-0.2','0.2-0.3','0.3-0.4','0.4-0.5','0.5-0.6','0.6-0.7','0.7-0.8','0.8-0.9','0.9-1.0']
# Bucket the correlations: one open bucket below 0.0, then ten half-open
# [lo, hi) buckets of width 0.1. This replaces eleven copy-pasted freqN
# lines with identical semantics: NaN correlations fail every comparison
# and are counted nowhere (as before), and so is an exact 1.0.
edges = [j / 10.0 for j in range(11)]  # 0.0, 0.1, ..., 1.0
frequencies = [len([cor for cor in correls if cor < 0.0])]
frequencies += [len([cor for cor in correls if lo <= cor < hi])
                for lo, hi in zip(edges[:-1], edges[1:])]
cat = np.arange(len(categories))
width = 1.0 # gives histogram aspect to the bar diagram
ax = plt.axes()
# Center the category labels under the unit-width bars.
ax.set_xticks(cat + (width / 2))
ax.set_xticklabels(categories)
plt.bar(cat,frequencies,width)
plt.xlabel('Correlation Ranges')
plt.ylabel('Frequencies')
plt.title('Histogram of Correlation of Stock Pairs')
plt.show()
In [10]:
In [ ]: