In [ ]:
#https://github.com/matthewgilbert/pdblp
In [10]:
import pandas as pd
import numpy as np
import re
In [11]:
# NOTE(review): each read below overwrites the previous one — only the
# EQSecurityMaster.csv load actually takes effect. These look hand-toggled;
# keep just the line you want active.
Universe = pd.read_csv('CompositeDef.csv')
Universe = pd.read_csv('UniverseStrategyTS.csv')
Universe = pd.read_csv('EQSecurityMaster.csv')
#Universe = pd.read_csv('Competitors.csv')
In [ ]:
Universe.Tickers[0:10]
In [ ]:
#[np.isnan(t) for t in Universe.Tickers]
# Keep only entries that are actual strings; NaN rows in the CSV come
# through as floats and are dropped here.
Tickers = Universe.Tickers
is_string = [type(t) is str for t in Tickers]
Tickers = Tickers[is_string]
In [ ]:
# Keep only Bloomberg equity tickers (those containing the word "Equity").
equity_pat = re.compile(r'Equity')
Tickers = [t for t in Tickers if equity_pat.search(t)]
Tickers
In [7]:
import pdblp
con = pdblp.BCon(debug=False, port=8194, timeout=5000)
In [9]:
import pdblpext
In [8]:
con.start()
In [13]:
Tickers = np.array(["ENRFCL 1GQ Index", "CFWDCL 1GQ Index"])
Tickers.shape
Out[13]:
In [14]:
import numpy as np

# Probe each ticker against Bloomberg: request its NAME field and mark the
# ticker valid only when the request succeeds.
iValid = np.full(len(Tickers), False)
for iT, iTicker in enumerate(Tickers):
    try:
        n = con.ref(iTicker, "NAME")  # kept: `n` is inspected in a later cell
        iValid[iT] = True
    except Exception as exc:
        # was a bare `except:` — that also swallowed KeyboardInterrupt and
        # hid the actual failure reason
        print(iT, exc)
In [16]:
iValid
n
Out[16]:
In [17]:
#Tickers = Tickers[iValid]
oTickers = Tickers
In [ ]:
# Gather the full ticker list from all 69 cached CUR_MKT_CAP pickle parts.
# Level 1 of the column MultiIndex (the field name) is dropped, leaving
# one entry per ticker.
Tickers = []
for fname in ('EQTSCUR_MKT_CAP{}.pkl'.format(i) for i in np.arange(0, 69)):
    part = pd.read_pickle(fname)
    Tickers.extend(part.columns.droplevel(1).tolist())
len(Tickers)
In [52]:
from datetime import date, timedelta
# Date window: yesterday back to the start date.
todate = (date.today() - timedelta(1)).strftime('%Y%m%d')
fromdate = date(2000, 1, 1).strftime('%Y%m%d')
# NOTE(review): the line above is dead — immediately overwritten; the
# effective start date is 2008-01-01.
fromdate = date(2008, 1, 1).strftime('%Y%m%d')
fields = ['PX_LAST', 'BN_SURVEY_AVERAGE', 'BN_SURVEY_LOW', 'BN_SURVEY_HIGH']
#fields = ['PX_LAST', ""
elms = []
# NOTE(review): rebinds Tickers (a list/array in earlier cells) to a single
# string — hidden-state hazard for any later cell expecting the collection.
Tickers = "ENRFCL 1GQ Index"
#Tickers = "SPX Index"
print(Tickers, fields, todate, fromdate)
# Fetch each field's history one request at a time; bdh returns a single
# column here, taken via iloc[:, 0].
a = pd.DataFrame()
for fld in fields:
a[fld] = con.bdh(Tickers, fld, fromdate, todate).iloc[:, 0]
a
Out[52]:
In [53]:
a.plot()
Out[53]:
In [37]:
import ipdb
In [38]:
# NOTE(review): leftover interactive debugging — drops into ipdb on any
# exception raised by the bdh call. Remove before sharing / Restart-&-Run-All.
with ipdb.launch_ipdb_on_exception():
con.bdh("ENRFCL 1GQ Index", "PX_LAST", fromdate, todate)
In [ ]:
from datetime import date, timedelta
# NOTE(review): this cell stacks several hand-toggled configurations; each
# group overwrites the previous one, so only the LAST assignments to
# todate/fromdate/fields/elms take effect (quarterly factor fields,
# 1999-12-31 .. 2018-06-30).
todate = (date.today() - timedelta(1)).strftime('%Y%m%d')
fromdate = date(2000, 1, 1).strftime('%Y%m%d')
fields = {'PX_LAST', 'TOT_RETURN_INDEX_NET_DVDS', 'PX_VOLUME'}
elms = []
# -- dead config: overwritten below --
todate = date(2018, 6, 30).strftime('%Y%m%d')
fromdate = date(1999, 12, 31).strftime('%Y%m%d')
fields = {'CUR_MKT_CAP'}
elms = [("periodicitySelection", "MONTHLY")]
# -- dead config: overwritten below --
todate = (date.today() - timedelta(1)).strftime('%Y%m%d')
fromdate = date(2000, 1, 1).strftime('%Y%m%d')
fields = {'TOT_RETURN_INDEX_NET_DVDS'}
elms = []
# -- effective configuration --
todate = date(2018, 6, 30).strftime('%Y%m%d')
fromdate = date(1999, 12, 31).strftime('%Y%m%d')
elms = [("periodicitySelection", "QUARTERLY")]
fields = ['FISCAL_YEAR_PERIOD',
'ANNOUNCEMENT_DT',
'LATEST_PERIOD_END_DT_FULL_RECORD',
'BOOK_VAL_PER_SH',
'TRAIL_12M_EPS',
'BEST_EPS',
'EV_TO_T12M_EBITDA',
'EQY_DVD_YLD_IND_NET']
# batch size: number of tickers per Bloomberg request
cTickers = 50
In [ ]:
# Chunk the ticker list into consecutive batches of cTickers each, so a
# single Bloomberg request stays within a manageable size.
Tickers = list(Tickers)
#Tickers = Tickers.tolist()
bTickers = [Tickers[i:i + cTickers] for i in range(0, len(Tickers), cTickers)]
In [ ]:
# Download every field for every ticker batch and pickle each result.
# BUG FIX: `sTickers = bTickers` aliased the two lists, so writing into
# sTickers silently overwrote the batch definitions in bTickers; take a copy.
sTickers = list(bTickers)
for iSet, iTickers in enumerate(bTickers):
    for field in fields:
        try:
            # Each field overwrites sTickers[iSet]; only the last field's
            # frame stays in memory, but every field is persisted to disk.
            sTickers[iSet] = con.bdh(iTickers, field, fromdate, todate, elms)
            sTickers[iSet].to_pickle('./FACTORS{}{}.pkl'.format(field, iSet))
        except Exception as exc:
            # narrowed from a bare `except:`; include the reason so failed
            # batches can be diagnosed and retried
            print("failed", exc)
        finally:
            print(iSet)
In [ ]:
In [ ]:
con.stop()
In [ ]:
# Flatten two-level (ticker, field) columns down to the ticker level.
# Use the public pd.MultiIndex alias rather than the private
# pd.core.indexes.multi path, which is not a stable API.
for iB in sTickers:
    if isinstance(iB.columns, pd.MultiIndex):
        iB.columns = iB.columns.droplevel(1)
In [ ]:
# Reload the 32 cached batch pickles ('./0.pkl' .. './31.pkl') from disk.
# NOTE: 32 is the number of batches written by the download loop.
sTickers = [pd.read_pickle('./{}.pkl'.format(iBatch)) for iBatch in range(0, 32)]
In [ ]:
r = pd.concat(sTickers, axis=1)
In [ ]:
r.columns = r.columns.droplevel(1)
# NOTE(review): `y` is not defined until a much later cell (the nnls section)
# — this line only works out of execution order and will NameError on a
# fresh Restart-&-Run-All; confirm where `y` is meant to come from.
y.columns = y.columns.droplevel(1)
In [ ]:
import re
# Drop single-name equity columns, keeping only index/composite series in r.
equity_regex = re.compile(".*Equity.*")
keep_cols = [c for c in r.columns if not equity_regex.match(c)]
r = r.loc[:, keep_cols]
In [ ]:
print(r.columns.shape)
print(r.columns.unique().shape)
In [ ]:
Xy = pd.concat([r, y], axis=1)
In [ ]:
print(Xy.columns.shape)
print(Xy.columns.unique().shape)
print(r.columns.shape)
print(r.columns.unique().shape)
In [ ]:
Xy
In [ ]:
Xy.to_pickle('all.pkl')
In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
In [ ]:
Xy = pd.read_pickle('all.pkl')
Xy.loc[:, 'QSPNX US Equity']
In [ ]:
Xy.shape
In [ ]:
def isnan_leading(x):
    """Mask of NaNs that cannot be filled from inside the series/frame.

    True where `x` is NaN at the start (nothing to forward-fill from) or —
    despite the name — at the end (nothing to backward-fill from).
    Interior NaNs, which both fills cover, come back False.
    """
    no_ffill_source = x.ffill().isna()
    no_bfill_source = x.bfill().isna()
    return no_ffill_source | no_bfill_source
In [ ]:
# remove before y: drop rows where the target (last column) has leading or
# trailing NaNs. `.as_matrix()` was removed in pandas 1.0 — use .to_numpy().
iVal = ~isnan_leading(Xy)
Xy = Xy.iloc[iVal.iloc[:, -1].to_numpy(), :]
# remove dates with too few observations
iDVal = Xy.notnull().sum(axis=1) > 20
Xy = Xy.iloc[iDVal.to_numpy(), :]
# remove x with too few observations
iXVal = Xy.notnull().sum(axis=0) > 1000
Xy = Xy.iloc[:, iXVal.to_numpy()]
# remove x with negative values since that is not a TRI
iXVal = (Xy.fillna(1) > 0).all(axis=0)
Xy = Xy.iloc[:, iXVal.to_numpy()]
In [ ]:
# show missing: observations per date, per column, and as an image.
# `.as_matrix()` was removed in pandas 1.0 — `.to_numpy()` is the replacement.
Xy.notnull().sum(axis=1).plot()
plt.show()
Xy.notnull().sum(axis=0).hist(bins=40)
plt.show()
plt.imshow(Xy.notnull().to_numpy())
plt.show()
In [ ]:
# fillna: forward-fill interior gaps, then restore NaN wherever even a
# backfill had no value (trailing NaNs), so ffill does not fabricate data
# past a series' last observation.
iVal = Xy.bfill().isnull()
Xy = Xy.ffill()
# BUG FIX: `.iloc` does not accept a boolean DataFrame mask; plain []
# assignment with a boolean frame sets exactly the masked cells.
Xy[iVal] = np.nan
In [ ]:
# check na
In [ ]:
# calc ret: simple daily returns from the level series; first row -> 0.
TRI = Xy
RET = (TRI/TRI.shift(1)-1).fillna(0)
# NOTE(review): summing simple returns over 5 days only approximates the
# 5-day return (exact for log returns) — confirm this is intended.
RET5 = RET.rolling(5).sum().iloc[5:, :]
In [ ]:
# Annualized volatility per column.
# NOTE(review): sqrt(250) annualizes *daily* returns, but RET5 holds 5-day
# rolling sums — sqrt(50) may be what's intended here; confirm.
Vol = RET5.std()*np.sqrt(250)
Vol.sort_values().plot()
plt.show()
Vol.sort_values()
In [ ]:
Xy.loc[:, "VIX Index"].plot()
plt.show()
In [ ]:
# Scale every series to a ~10% vol target.
# NOTE(review): Vol was estimated from 5-day returns yet is applied to the
# daily RET as well — verify the scaling is consistent across both.
RET = RET / Vol * 0.1
RET5 = RET5 / Vol * 0.1
# NOTE(review): this is a cumulative-return curve starting near 0, not a
# true index level (a TRI would omit the trailing "-1").
TRI = (1+RET).cumprod()-1
In [ ]:
from scipy.optimize import nnls
In [ ]:
b0 = np.ones(RET5.shape[1])/RET5.shape[1]
In [ ]:
# Non-negative least squares: fit the last column (target) on all others.
# `.as_matrix()` was removed in pandas 1.0 — use `.to_numpy()`.
X5 = RET5.iloc[:, :-1].to_numpy()
y5 = RET5.iloc[:, -1].to_numpy()
# NOTE: `r` (the residual norm) shadows the DataFrame `r` from earlier cells.
b, r = nnls(X5, y5)
In [ ]:
RET.columns.shape
In [ ]:
plt.plot(b)
plt.show()
In [ ]:
yhat = RET.iloc[:, :-1].dot(b)
y = RET.iloc[:, -1]
pd.concat([y, yhat], axis=1).cumsum(axis=0).plot()
plt.show()
In [ ]:
Xy_ = pd.read_pickle('./all.pkl')
Xy_ = Xy_.loc[:, RET.columns.tolist()]
print(Xy_.shape)
print(b.shape)
print(RET.shape)
In [ ]:
In [ ]:
Xy_ = Xy_.ffill()
RET_ = Xy_/Xy_.shift(1)-1
RET_ = RET_ / Vol * 0.1
yhat_ = RET_.iloc[:, :-1].fillna(0).dot(b)
obs_ = RET_.iloc[:, :-1].notnull().dot(b)
y_ = RET_.iloc[:, -1]
obs_.plot()
plt.show()
In [ ]:
Y = pd.concat([y, yhat_, obs_], axis=1)
Y
In [ ]:
Y.iloc[4500:, :2].cumsum(axis=0).plot()
plt.show()
Y.iloc[:, 1].cumsum(axis=0).plot()
plt.show()
In [ ]:
Y.to_csv('out.csv')
In [ ]:
In [ ]:
import os
import re
import pandas as pd
def GetDataSets(FileDir='./parts/'):
    """Return the distinct dataset prefixes found in FileDir.

    Part files are named '<prefix><number>.pkl'; stripping the trailing
    '<number>.pkl' leaves one entry per dataset.

    Parameters
    ----------
    FileDir : str, optional
        Directory holding the part files (default './parts/', matching
        the original hard-coded path, so existing callers are unaffected).

    Returns
    -------
    list of str
        Unique dataset prefixes, in arbitrary order.
    """
    file_names = os.listdir(FileDir)
    # raw string: '\.' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python); the regex itself is unchanged.
    filesets = set(re.sub(r'[0-9]+\.pkl', '', fn) for fn in file_names)
    return list(filesets)
filesets = GetDataSets()
In [ ]:
In [ ]:
def CombineParts(fileset, PartsDir='./parts/'):
    """Concatenate all part files of one dataset column-wise and cache it.

    Reads every '<fileset><number>.pkl' in PartsDir, concatenates the
    frames along the columns, writes the combined frame to
    './<fileset>.pkl', and returns it.

    Parameters
    ----------
    fileset : str
        Dataset prefix, as returned by GetDataSets().
    PartsDir : str, optional
        Directory holding the part files (default './parts/', matching
        the original hard-coded path).

    Returns
    -------
    pd.DataFrame
        The column-wise concatenation of all parts (column order follows
        os.listdir order, which is platform-dependent).
    """
    # raw string avoids the invalid '\.' escape warning; regex unchanged.
    pat = r'{}[0-9]+\.pkl'.format(fileset)
    files = [fn for fn in os.listdir(PartsDir) if re.match(pat, fn)]
    parts = []
    for file in files:
        parts.append(pd.read_pickle(PartsDir + file))
    parts = pd.concat(parts, axis=1)
    parts.to_pickle('./{}.pkl'.format(fileset))
    return parts
In [ ]:
for fs in filesets:
CombineParts(fs)
In [ ]:
In [ ]:
In [ ]:
import matplotlib.pyplot as plt
parts['.MLREGEF Index'].plot()
plt.show()
In [ ]:
parts.iloc[:, 0:5].plot()
plt.show()
In [ ]:
In [ ]:
In [ ]:
#parts.loc[:, ('.MLREGEF Index', 'PX_LAST')].ffill().bfill().plot()
parts.loc[:, '.MLREGEF Index'].ffill().bfill().plot()
plt.show()
In [ ]:
# Inspect the step between two adjacent non-NaN observations of one series.
# `.as_matrix()` was removed in pandas 1.0 — use `.to_numpy()`.
s = parts.loc[:, '.MLREGEF Index']
s = s[(~s.isnull()).to_numpy()]
s.iloc[3] - s.iloc[4]
In [ ]:
c = parts.columns
In [ ]:
list(c.levels[0])
In [ ]:
A = pd.read_pickle('AssetTS.pkl')
RP = pd.read_pickle('RP.pkl')
C = pd.concat([A, RP], axis=1)
C.to_pickle('NonEQTS.pkl')
In [ ]:
import matplotlib.pyplot as plt
plt.imshow(C)
plt.show()
In [ ]:
C.isnull().sum().sort_values().plot()
plt.show()
In [ ]:
In [ ]:
# Stub: apparently intended to list dataset prefixes in an arbitrary
# directory (generalizing GetDataSets); currently it only echoes its
# argument — TODO implement or remove.
def GetDataSetsInDir(FileDir = './parts/'):
print(FileDir)
In [ ]:
In [ ]:
#from pdblpext import PDBLPext
import pdblpext
In [ ]:
In [ ]:
# BUG FIX: only `import pdblpext` is active above (the
# `from pdblpext import PDBLPext` line is commented out), so the bare name
# PDBLPext raises NameError on a fresh kernel — qualify it via the module.
a = pdblpext.PDBLPext(['asdf', 'dddd'])
In [ ]:
a.Tickers
In [ ]:
a.GetTickers()
In [ ]:
a.cBatches
In [ ]:
a.Test = 1
In [ ]:
a
In [ ]:
a.Test
In [ ]: