In [ ]:
#https://github.com/matthewgilbert/pdblp

Getting Data


In [10]:
import pandas as pd
import numpy as np
import re

In [11]:
Universe = pd.read_csv('CompositeDef.csv')
Universe = pd.read_csv('UniverseStrategyTS.csv')
Universe = pd.read_csv('EQSecurityMaster.csv')
#Universe = pd.read_csv('Competitors.csv')


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-11-8504c47b3929> in <module>()
----> 1 Universe = pd.read_csv('CompositeDef.csv')
      2 Universe = pd.read_csv('UniverseStrategyTS.csv')
      3 Universe = pd.read_csv('EQSecurityMaster.csv')
      4 #Universe = pd.read_csv('Competitors.csv')

c:\winpython\python-3.5.4.amd64\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    700                     skip_blank_lines=skip_blank_lines)
    701 
--> 702         return _read(filepath_or_buffer, kwds)
    703 
    704     parser_f.__name__ = name

c:\winpython\python-3.5.4.amd64\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    427 
    428     # Create the parser.
--> 429     parser = TextFileReader(filepath_or_buffer, **kwds)
    430 
    431     if chunksize or iterator:

c:\winpython\python-3.5.4.amd64\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
    893             self.options['has_index_names'] = kwds['has_index_names']
    894 
--> 895         self._make_engine(self.engine)
    896 
    897     def close(self):

c:\winpython\python-3.5.4.amd64\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
   1120     def _make_engine(self, engine='c'):
   1121         if engine == 'c':
-> 1122             self._engine = CParserWrapper(self.f, **self.options)
   1123         else:
   1124             if engine == 'python':

c:\winpython\python-3.5.4.amd64\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
   1851         kwds['usecols'] = self.usecols
   1852 
-> 1853         self._reader = parsers.TextReader(src, **kwds)
   1854         self.unnamed_cols = self._reader.unnamed_cols
   1855 

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: [Errno 2] File b'CompositeDef.csv' does not exist: b'CompositeDef.csv'

In [ ]:
Universe.Tickers[0:10]

In [ ]:
#[np.isnan(t) for t in Universe.Tickers]
Tickers = Universe.Tickers
iVal = [type(t)==str for t in Tickers]
Tickers = Tickers[iVal]

In [ ]:
pat = re.compile(r'Equity')


#Tickers = Universe.Tickers[~Universe.CheckBB]
#Tickers = Tickers.unique()
Tickers = [t for t in Tickers if pat.search(t)]
Tickers

In [7]:
import pdblp
con = pdblp.BCon(debug=False, port=8194, timeout=5000)

In [9]:
import pdblpext

In [8]:
con.start()

In [13]:
Tickers = np.array(["ENRFCL 1GQ Index", "CFWDCL 1GQ Index"])
Tickers.shape


Out[13]:
(2,)

In [14]:
import numpy as np
iValid = np.full((len(Tickers)), False)

for iT, iTicker in enumerate(Tickers):
    try:
        n = con.ref(iTicker, "NAME")
        iValid[iT] = True
    except:
        print(iT)

In [16]:
iValid
n


Out[16]:
ticker field value
0 CFWDCL 1GQ Index NAME New York Mercantile Exchange/W

In [17]:
#Tickers = Tickers[iValid]
oTickers = Tickers

In [ ]:
Tickers = list()
for f in ['EQTSCUR_MKT_CAP{}.pkl'.format(i) for i in np.arange(0,69)]:
    d = pd.read_pickle(f)
    Tickers = Tickers + d.columns.droplevel(1).tolist()

    
len(Tickers)

In [52]:
from datetime import date, timedelta
todate = (date.today() - timedelta(1)).strftime('%Y%m%d')
fromdate = date(2000, 1, 1).strftime('%Y%m%d')
fromdate = date(2008, 1, 1).strftime('%Y%m%d')
fields = ['PX_LAST', 'BN_SURVEY_AVERAGE', 'BN_SURVEY_LOW', 'BN_SURVEY_HIGH']
#fields = ['PX_LAST', ""
elms = []
Tickers = "ENRFCL 1GQ Index"
#Tickers = "SPX Index"
print(Tickers, fields, todate, fromdate)
a = pd.DataFrame()
for fld in fields:
    a[fld] = con.bdh(Tickers, fld, fromdate, todate).iloc[:, 0]
a


ENRFCL 1GQ Index ['PX_LAST', 'BN_SURVEY_AVERAGE', 'BN_SURVEY_LOW', 'BN_SURVEY_HIGH'] 20190724 20080101
Out[52]:
PX_LAST BN_SURVEY_AVERAGE BN_SURVEY_LOW BN_SURVEY_HIGH
date
2008-01-03 80.00 78.77 60.00 95.0
2008-01-07 80.00 78.31 60.00 95.0
2008-01-09 80.00 79.77 65.00 95.0
2008-01-10 80.00 78.31 60.00 95.0
2008-01-11 80.00 79.47 65.00 95.0
2008-01-14 80.00 79.47 65.00 95.0
2008-01-15 80.00 80.14 65.00 95.0
2008-01-16 80.00 79.68 65.00 95.0
2008-01-17 80.00 80.33 65.00 95.0
2008-01-18 80.00 80.33 65.00 95.0
2008-01-28 80.00 80.33 65.00 95.0
2008-01-30 80.00 80.65 60.00 95.0
2008-02-01 84.00 82.12 60.00 95.0
2008-02-05 83.00 81.32 60.00 95.0
2008-02-06 81.50 81.08 60.00 95.0
2008-02-07 83.73 81.47 60.00 95.0
2008-02-08 83.73 81.47 60.00 95.0
2008-02-12 83.73 81.47 60.00 95.0
2008-02-13 83.73 81.47 60.00 95.0
2008-02-15 83.73 81.47 60.00 95.0
2008-02-18 83.73 81.47 60.00 95.0
2008-02-19 83.73 81.47 60.00 95.0
2008-02-20 83.73 81.47 60.00 95.0
2008-02-22 83.73 81.47 60.00 95.0
2008-02-27 83.73 81.47 60.00 95.0
2008-02-28 85.50 84.82 70.00 98.0
2008-02-29 86.00 83.38 60.00 95.0
2008-03-03 87.90 86.66 70.00 98.0
2008-03-07 86.00 83.38 60.00 95.0
2008-03-11 88.50 85.14 60.00 95.0
... ... ... ... ...
2019-05-23 60.50 60.82 49.70 70.8
2019-05-27 60.50 60.82 49.70 70.8
2019-05-28 60.50 60.82 49.70 70.8
2019-05-29 60.50 60.82 49.70 70.8
2019-05-30 60.50 60.82 49.70 70.8
2019-06-03 60.00 60.64 49.70 70.8
2019-06-04 60.00 60.64 49.70 70.8
2019-06-05 60.50 60.73 49.70 70.8
2019-06-06 60.00 60.16 49.70 70.8
2019-06-07 60.00 60.28 49.70 70.8
2019-06-10 60.00 60.58 49.70 70.8
2019-06-11 60.10 60.99 51.50 70.8
2019-06-12 60.05 60.68 51.50 70.8
2019-06-14 60.05 60.45 51.50 70.8
2019-06-17 60.10 60.47 51.50 70.8
2019-06-24 60.10 60.47 51.50 70.8
2019-06-25 60.10 60.47 51.50 70.8
2019-06-27 60.37 60.53 51.50 70.8
2019-07-01 62.00 61.36 52.00 70.0
2019-07-02 62.00 61.36 52.00 70.0
2019-07-03 62.00 61.36 52.00 70.0
2019-07-05 62.00 61.58 52.93 70.0
2019-07-09 61.50 61.51 52.93 70.0
2019-07-12 62.00 61.88 54.20 70.0
2019-07-16 61.50 61.65 54.20 70.0
2019-07-17 62.00 61.80 54.20 70.0
2019-07-18 61.00 61.51 54.20 70.0
2019-07-19 61.00 61.51 54.20 70.0
2019-07-22 60.50 61.45 54.20 70.0
2019-07-23 61.00 61.68 54.20 70.0

1886 rows × 4 columns


In [53]:
a.plot()


Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x178b54ec0f0>

In [37]:
import ipdb

In [38]:
with ipdb.launch_ipdb_on_exception():
    con.bdh("ENRFCL 1GQ Index", "PX_LAST", fromdate, todate)


ValueError('Length mismatch: Expected axis has 0 elements, new values have 4 elements',)
> c:\winpython\python-3.5.4.amd64\lib\site-packages\pandas\core\internals\managers.py(155)set_axis()
    154                 'Length mismatch: Expected axis has {old} elements, new '
--> 155                 'values have {new} elements'.format(old=old_len, new=new_len))
    156 

['axis', 'new_labels', 'new_len', 'old_len', 'self']
BlockManager
Items: Index([], dtype='object')
Axis 1: Index([], dtype='object')
> c:\winpython\python-3.5.4.amd64\lib\site-packages\pandas\core\generic.py(638)_set_axis()
    637     def _set_axis(self, axis, labels):
--> 638         self._data.set_axis(axis, labels)
    639         self._clear_item_cache()

BlockManager
Items: Index([], dtype='object')
Axis 1: Index([], dtype='object')
> c:\dev\arpeggioqi\arpeggioone\arpeggioone\pandas\_libs\properties.pyx(69)pandas._libs.properties.AxisProperty.__set__()

['AxisProperty', 'CachedProperty', '__builtins__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__pyx_unpickle_AxisProperty', '__pyx_unpickle_CachedProperty', '__spec__', '__test__', 'cache_readonly']

In [ ]:
from datetime import date, timedelta
todate = (date.today() - timedelta(1)).strftime('%Y%m%d')
fromdate = date(2000, 1, 1).strftime('%Y%m%d')
fields = {'PX_LAST', 'TOT_RETURN_INDEX_NET_DVDS', 'PX_VOLUME'}
elms = []

todate = date(2018, 6, 30).strftime('%Y%m%d')
fromdate = date(1999, 12, 31).strftime('%Y%m%d')
fields = {'CUR_MKT_CAP'}
elms = [("periodicitySelection", "MONTHLY")]


todate = (date.today() - timedelta(1)).strftime('%Y%m%d')
fromdate = date(2000, 1, 1).strftime('%Y%m%d')
fields = {'TOT_RETURN_INDEX_NET_DVDS'}
elms = []


todate = date(2018, 6, 30).strftime('%Y%m%d')
fromdate = date(1999, 12, 31).strftime('%Y%m%d')
elms = [("periodicitySelection", "QUARTERLY")]
fields = ['FISCAL_YEAR_PERIOD',
    'ANNOUNCEMENT_DT',
    'LATEST_PERIOD_END_DT_FULL_RECORD',
    'BOOK_VAL_PER_SH',
    'TRAIL_12M_EPS',
    'BEST_EPS',
    'EV_TO_T12M_EBITDA',
    'EQY_DVD_YLD_IND_NET']


cTickers = 50

In [ ]:
Tickers = list(Tickers)
#Tickers = Tickers.tolist()
bTickers = [Tickers[i:i + cTickers] for i in range(0, len(Tickers), cTickers)]

In [ ]:
sTickers = bTickers;
for iSet, iTickers in enumerate(bTickers):
    
    for field in fields:
        try:
            sTickers[iSet] = con.bdh(iTickers, field, fromdate, todate, elms)
            sTickers[iSet].to_pickle('./FACTORS{}{}.pkl'.format(field, iSet))
            
        except:
            print("failed")
        finally:
            print(iSet)

In [ ]:


In [ ]:
con.stop()

In [ ]:
for iB in sTickers:
    if isinstance(iB.columns, pd.core.indexes.multi.MultiIndex):
        iB.columns = iB.columns.droplevel(1)

In [ ]:
sTickers = 32*[None]
for iB in range(0, 32):
    sTickers[iB] = pd.read_pickle('./{}.pkl'.format(iB))

In [ ]:
r = pd.concat(sTickers, axis=1)

In [ ]:
r.columns = r.columns.droplevel(1)
y.columns = y.columns.droplevel(1)

In [ ]:
import re
regex = re.compile(".*Equity.*")

iCol = [i for i in r.columns if not regex.match(i)]


r = r.loc[:, iCol]

In [ ]:
print(r.columns.shape)
print(r.columns.unique().shape)

In [ ]:
Xy = pd.concat([r, y], axis=1)

In [ ]:
print(Xy.columns.shape)
print(Xy.columns.unique().shape)
print(r.columns.shape)
print(r.columns.unique().shape)

In [ ]:
Xy

In [ ]:
Xy.to_pickle('all.pkl')

Data Cleaning


In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [ ]:
Xy = pd.read_pickle('all.pkl')

Xy.loc[:, 'QSPNX US Equity']

In [ ]:
Xy.shape

In [ ]:
def isnan_leading(x):
#    return range(1, x.shape[0]+1) * x.notnull() == 0

    return x.ffill().isnull() | x.bfill().isnull()

In [ ]:
# remove before y
iVal = ~isnan_leading(Xy)
Xy = Xy.iloc[iVal.iloc[:, -1].as_matrix(), :]

# remove dates with too little obs
iDVal = Xy.notnull().sum(axis=1)>20
Xy = Xy.iloc[iDVal.as_matrix(), :]

# remove x with too little obs
iXVal = Xy.notnull().sum(axis=0)>1000
Xy = Xy.iloc[:, iXVal.as_matrix()]

# remove x with negative values since that is not a TRI
iXVal = (Xy.fillna(1)>0).all(axis=0)
Xy = Xy.iloc[:, iXVal.as_matrix()]

In [ ]:
# show missing
Xy.notnull().sum(axis=1).plot()
plt.show()

Xy.notnull().sum(axis=0).hist(bins=40)
plt.show()

plt.imshow(Xy.notnull().as_matrix())
plt.show()

In [ ]:
# fillna
iVal = Xy.bfill().isnull()
Xy = Xy.ffill()
Xy.iloc[iVal] = np.nan

In [ ]:
# check na

In [ ]:
# calc ret
TRI = Xy
RET = (TRI/TRI.shift(1)-1).fillna(0)
RET5 = RET.rolling(5).sum().iloc[5:, :]

In [ ]:
Vol = RET5.std()*np.sqrt(250)
Vol.sort_values().plot()
plt.show()

Vol.sort_values()

In [ ]:
Xy.loc[:, "VIX Index"].plot()
plt.show()

In [ ]:
RET = RET / Vol * 0.1
RET5 = RET5 / Vol * 0.1
TRI = (1+RET).cumprod()-1

In [ ]:
from scipy.optimize import nnls

In [ ]:
b0 = np.ones(RET5.shape[1])/RET5.shape[1]

In [ ]:
X5 = RET5.iloc[:, :-1].as_matrix()
y5 = RET5.iloc[:, -1].as_matrix()

b, r = nnls(X5, y5)

In [ ]:
RET.columns.shape

In [ ]:
plt.plot(b)
plt.show()

In [ ]:
yhat = RET.iloc[:, :-1].dot(b)
y = RET.iloc[:, -1]
pd.concat([y, yhat], axis=1).cumsum(axis=0).plot()

plt.show()

Back fill history


In [ ]:
Xy_ = pd.read_pickle('./all.pkl')

Xy_ = Xy_.loc[:, RET.columns.tolist()]

print(Xy_.shape)
print(b.shape)
print(RET.shape)

In [ ]:


In [ ]:
Xy_ = Xy_.ffill()

RET_ = Xy_/Xy_.shift(1)-1
RET_ = RET_ / Vol * 0.1

yhat_ = RET_.iloc[:, :-1].fillna(0).dot(b)
obs_ = RET_.iloc[:, :-1].notnull().dot(b)
y_ = RET_.iloc[:, -1]


obs_.plot()
plt.show()

In [ ]:
Y = pd.concat([y, yhat_, obs_], axis=1)
Y

In [ ]:
Y.iloc[4500:, :2].cumsum(axis=0).plot()
plt.show()

Y.iloc[:, 1].cumsum(axis=0).plot()
plt.show()

In [ ]:
Y.to_csv('out.csv')

In [ ]:

Combine


In [ ]:
import os
import re
import pandas as pd

def GetDataSets():
    file_names = os.listdir('./parts/')
    filesets = set([re.sub('[0-9]+\.pkl', '', fn) for fn in file_names])
    filesets = list(filesets)
    return filesets


filesets = GetDataSets()

In [ ]:


In [ ]:
def CombineParts(fileset):
    pat = '{}[0-9]+\.pkl'.format(fileset)

    files = [fn for fn in os.listdir('./parts/') if re.match(pat, fn)]
    len(files)
    
    parts = []
    for file in files:
        part = pd.read_pickle('./parts/' + file)
        parts.append(part)
        
    parts = pd.concat(parts, axis=1)
    parts.to_pickle('./{}.pkl'.format(fileset))
    
    return parts

In [ ]:
for fs in filesets:
    CombineParts(fs)

In [ ]:


In [ ]:


In [ ]:
import matplotlib.pyplot as plt
parts['.MLREGEF Index'].plot()
plt.show()

In [ ]:
parts.iloc[:, 0:5].plot()
plt.show()

In [ ]:


In [ ]:


In [ ]:
#parts.loc[:, ('.MLREGEF Index', 'PX_LAST')].ffill().bfill().plot()
parts.loc[:, '.MLREGEF Index'].ffill().bfill().plot()
plt.show()

In [ ]:
s = parts.loc[:, '.MLREGEF Index']

s = s[(~s.isnull()).as_matrix()]

s.iloc[3] - s.iloc[4]

In [ ]:
c = parts.columns

In [ ]:
list(c.levels[0])

In [ ]:
A = pd.read_pickle('AssetTS.pkl')
RP = pd.read_pickle('RP.pkl')
C = pd.concat([A, RP], axis=1)
C.to_pickle('NonEQTS.pkl')

In [ ]:
import matplotlib.pyplot as plt
plt.imshow(C)
plt.show()

In [ ]:
C.isnull().sum().sort_values().plot()
plt.show()

In [ ]:


In [ ]:
def GetDataSetsInDir(FileDir = './parts/'):
    print(FileDir)

In [ ]:


In [ ]:
#from pdblpext import PDBLPext

import pdblpext

In [ ]:


In [ ]:
a = PDBLPext(['asdf', 'dddd'])

In [ ]:
a.Tickers

In [ ]:
a.GetTickers()

In [ ]:
a.cBatches

In [ ]:
a.Test = 1

In [ ]:
a

In [ ]:
a.Test

In [ ]: