In [1]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts
from itertools import combinations
import multiprocessing
from multiprocessing import Pool
import cPickle as pkl
In [2]:
# Every unordered pair of the 10 test columns: (0,1), (0,2), ..., (8,9).
pairList = list(combinations(range(10), 2))
# Test frame for the distance function: column j is constant at j**2 + 1
# in all 10 rows, so pairwise distances have a known closed form.
data = np.tile(np.arange(10) ** 2, (10, 1)) + 1
distDF = pd.DataFrame(data)
distDF.head()
Out[2]:
In [3]:
# Define distance function
def dist(tlist, frame=None):
    """Squared-difference distance between two columns of a DataFrame.

    Parameters
    ----------
    tlist : sequence of two column labels (xname, yname).
    frame : optional DataFrame to read the columns from; defaults to the
        notebook-global ``distDF`` for backward compatibility with the
        multiprocessing map in ``test_distFun``.

    Returns
    -------
    tuple
        ``([(xname, yname)], d)`` where ``d`` is the sum of squared
        elementwise differences over rows where both columns are present.
    """
    if frame is None:
        frame = distDF
    xname, yname = tlist[0], tlist[1]
    # Columns are deliberately not standardized: the test data is constant
    # per column, so dividing by the first value would add nothing here.
    x = frame[xname]
    y = frame[yname]
    # Drop rows where either series is missing before measuring distance.
    diff = (x - y).dropna()
    # NOTE(review): an earlier version filtered out pairs with fewer than
    # ~495 overlapping rows; restore that guard before running on sparse
    # real data. (Original used sum(map(lambda z: z**2, z)), which shadowed
    # the series name and bypassed vectorization.)
    return ([(xname, yname)], (diff ** 2).sum())
In [4]:
# Define distance test function
def test_distFun():
    """Check dist() over all pairs via a worker pool.

    NOTE(review): the original wrapped this body in
    ``if __name__ == '__main__':``, which makes the test a silent no-op
    (or a NameError on the assert) under a test runner, since a test
    module never runs as __main__. The guard belongs in scripts that
    spawn processes on Windows, not inside a test function.
    """
    trainDistPool = Pool(processes=4)
    try:
        # Full run over pairList; pairList[4] == (0, 5), whose columns
        # differ by a constant 25 across 10 rows, so the squared
        # distance must be 10 * 5**4. (Warning: slow on large inputs.)
        trainDistResult = pd.DataFrame(trainDistPool.map(dist, pairList))
        assert trainDistResult[1][4] == 10 * (5 ** 4)
    finally:
        # Always release the workers, even if the assert fails.
        trainDistPool.close()
        trainDistPool.join()
In [21]:
## Successful test data
# Two cointegrated series: a standard-normal base series, plus the same
# series perturbed by bounded uniform(-1, 1) noise — their spread is
# stationary by construction, so the cointegration test should pass.
np.random.seed(0)  # review: was unseeded, making the tests below flaky
normalrand = np.random.normal(0, 1, 100)                 # N(0, 1)
cointnorm = normalrand + np.random.uniform(-1, 1, 100)   # N(0,1) + U(-1,1)
In [22]:
## Fail test data
# Example 1: two independent Gaussian random walks — each is integrated,
# but they share no stochastic trend, so they are not cointegrated.
x = np.random.normal(0, 1, 1000).cumsum()
y = np.random.normal(0, 1, 1000).cumsum()
# Example 2: cumulative sums of uniform and exponential draws — both trend
# upward, but again with no shared stochastic trend.
x1 = np.random.uniform(1000, 10000, 1000).cumsum()
y1 = np.random.exponential(10, 1000).cumsum()
In [23]:
# Define cointegration function
def cointegration(y, x):
    """Engle-Granger-style check: regress y on x (no intercept) and run an
    augmented Dickey-Fuller test on the residuals.

    Returns the full adfuller result tuple; index [1] is the p-value.
    NOTE(review): the p-values reported here use standard ADF critical
    values rather than Engle-Granger ones — confirm that is acceptable
    for this analysis.
    """
    residuals = stat.OLS(y, x).fit().resid
    return ts.adfuller(residuals)
In [24]:
# Define cointegration tests
def test_cointFun():
    # Cointegrated pair: expect a small ADF p-value on the residuals.
    pvalue = cointegration(normalrand, cointnorm)[1]
    assert pvalue <= 0.05

def test_cointFail():
    # Independent random walks: residuals remain non-stationary.
    pvalue = cointegration(y, x)[1]
    assert pvalue >= 0.05

def test_cointFail2():
    # Trending but unrelated series: residuals remain non-stationary.
    pvalue = cointegration(y1, x1)[1]
    assert pvalue >= 0.05
In [9]:
# For correlated data, draw x from a standard normal and build y with the
# bivariate-normal construction y = rho*x + sqrt(1 - rho**2)*e, which
# gives corr(x, y) = rho exactly in expectation.
rho = .5  # target correlation threshold
np.random.seed(1)  # review: was unseeded, making test_corrFun flaky
xCorr = np.random.normal(0, 1, 100)
# BUG FIX: the original scaled the noise by (1 - rho**2) instead of
# sqrt(1 - rho**2); that leaves Var(y) != 1 and biases the realized
# correlation (to ~0.555 for rho = 0.5) instead of hitting rho.
yCorr = rho * xCorr + np.sqrt(1 - rho ** 2) * np.random.normal(0, 1, 100)
In [10]:
# Define correlation function
def correlate(tlist):
    """Pearson correlation of the two array-likes in tlist.

    Parameters
    ----------
    tlist : sequence whose first two entries are the series to correlate.

    Returns
    -------
    The off-diagonal entry of np.corrcoef, or an empty tuple when the
    inputs cannot be correlated (ValueError/TypeError).
    """
    try:
        first, second = tlist[0], tlist[1]
        # NOTE(review): entries are the data themselves here; an earlier
        # version looked column names up in a trainData frame and skipped
        # pairs with too many missing values.
        return np.corrcoef(first, second)[0, 1]
    except (ValueError, TypeError):
        # Best-effort: signal "not computable" without raising.
        return ()
In [11]:
def test_corrFun():
    # The realized sample correlation should land within 0.2 of rho.
    err = abs(correlate([xCorr, yCorr]) - rho)
    assert err <= 0.2
In [12]:
%load_ext ipython_nose
In [25]:
%nose -v
Out[25]:
In [ ]: