In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle
import pandas as pd
import seaborn as sns
import numpy as np
np.set_printoptions(threshold=30)
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 15)
sns.set(style="whitegrid", context="poster", font_scale=1.3)
Any program is a function from the current state of the world $\rightarrow$ side effects.
Our World State:
Our Side Effects:
Many of these atomic computations share a common structure:
Three Major Kinds of Expression:
We can compose symbolic expressions differently based on the type of data they produce:
Factors (numerical-valued expressions), Filters (boolean-valued expressions), and Classifiers (categorical-valued expressions).
(2) to compute desired portfolio allocations.
Symbolic and/or deferred computation frameworks are increasingly the norm for providing a high-level API to performant code.
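As a minimal sketch of what composing such expressions can look like, assuming zipline's Pipeline API (which this notebook uses later); the particular moving-average factors and the uptrend screen are illustrative choices, not anything prescribed here:
from zipline.pipeline import Pipeline
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.factors import SimpleMovingAverage

# Factors: numerical-valued expressions computed for every (date, asset) pair.
short_mavg = SimpleMovingAverage(inputs=[USEquityPricing.close], window_length=10)
long_mavg = SimpleMovingAverage(inputs=[USEquityPricing.close], window_length=30)

# Filters: boolean-valued expressions, here produced by comparing two Factors.
in_uptrend = short_mavg > long_mavg

# Nothing has been computed yet; the Pipeline is a deferred expression graph.
pipe = Pipeline(columns={'short_mavg': short_mavg}, screen=in_uptrend)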
In the PyData ecosystem alone we have several frameworks that compile high-level expressions down to SQL or numpy.
In [3]:
from zipline.assets import AssetFinder
finder = AssetFinder("sqlite:///data/assets.db")
# Boolean frame indexed by date, one column per asset:
# True where the asset existed on that date.
lifetimes = finder.lifetimes(
    dates=pd.date_range('2001-01-01', '2015-10-01'),
    include_start_date=True,
)
lifetimes.head(5)
Out[3]:
In [4]:
daily_count = lifetimes.sum(axis=1)
daily_count.plot(title="Companies in Existence by Day");
In [5]:
AAPL_prices = pd.read_csv(
'data_public/AAPL-split.csv',
parse_dates=['Date'],
index_col='Date',
)
def plot_prices(prices):
price_plot = prices.plot(title='AAPL Price', grid=False)
price_plot.set_ylabel("Price", rotation='horizontal', labelpad=50)
price_plot.vlines(
['2014-05-08'], 0, 700,
label="$3.05 Dividend",
linestyles='dotted',
colors='black',
)
price_plot.vlines(
['2014-06-09'], 0, 700,
label="7:1 Split",
linestyles='--',
colors='black',
)
price_plot.legend()
sns.despine()
return price_plot
In [6]:
plot_prices(AAPL_prices);
In [7]:
naive_returns = AAPL_prices.pct_change()
naive_returns.plot();
Question: What's the "correct" value to return?
Answer: It depends.
The traditional solution to the above problems is to use "Adjusted Prices".
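A rough pandas sketch of that approach, using the AAPL_prices frame loaded above (the 7:1 ratio and split date are taken from the plot annotations; the $3.05 dividend would need a similar multiplicative adjustment):
split_date = pd.Timestamp('2014-06-09')
split_ratio = 7.0

# Back-adjust: scale every pre-split price down by the split ratio so that
# pct_change() no longer sees a huge spurious negative return on the split date.
adjusted = AAPL_prices.copy()
adjusted.loc[adjusted.index < split_date] /= split_ratio

adjusted_returns = adjusted.pct_change()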
Naively, we want to say something like this:
$D(a, t)$ is the value of dataset $D$ for asset $a$ at time $t$.
This abstraction is broken in the face of splits, dividends, and restatements.
In [8]:
import bcolz  # import the module rather than shadowing the builtin ``open``
from humanize import naturalsize
all_prices = bcolz.open('data/equity_daily_bars.bcolz')
# Per-asset offsets into the shared trading calendar stored on the ctable.
min_offset = min(all_prices.attrs['calendar_offset'].itervalues())
max_offset = max(all_prices.attrs['calendar_offset'].itervalues())
calendar = pd.DatetimeIndex(all_prices.attrs['calendar'])[min_offset:max_offset]
nassets = len(lifetimes.columns)
ndates = len(calendar)
nfields = len(('id', 'open', 'high', 'low', 'close', 'volume', 'date'))
print "Number of Assets: %d" % nassets
print "Number of Dates: %d" % ndates
print "Naive Dataset Size: %s" % naturalsize(
nassets * ndates * nfields * 8
)
In [9]:
!du -h -d0 data/equity_daily_bars.bcolz
!du -h -d0 data/adjustments.db
Tricks used to make dataset smaller:
Rule 5: Data dominates. If you've chosen the right data structures and organized things well, the algorithms will almost always be self-evident. Data structures, not algorithms, are central to programming. - Rob Pike
In [10]:
import pandas as pd
from zipline.utils.tradingcalendar import trading_day
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.loaders import USEquityPricingLoader
loader = USEquityPricingLoader.from_files(
'data/equity_daily_bars.bcolz',
'data/adjustments.db'
)
dates = pd.date_range(
'2014-5-20',
'2014-06-30',
freq=trading_day,
tz='UTC',
)
In [11]:
# load_adjusted_array() returns a dictionary mapping columns to instances of `AdjustedArray`.
(closes,) = loader.load_adjusted_array(
columns=[USEquityPricing.close],
dates=dates,
assets=pd.Int64Index([24, 5061]),
mask=None,
).values()
closes
Out[11]:
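Conceptually, each AdjustedArray pairs a baseline array of raw values with a collection of adjustments to apply once a trailing window's end date crosses an event such as a split. A toy numpy illustration of the idea (made-up values, not the zipline internals):
raw = np.array([700.0, 693.0, 99.0, 100.0])  # hypothetical closes straddling a 7:1 split
split_ratio = 7.0

# A window ending after the split should see the pre-split rows scaled by
# 1 / ratio; traverse() below performs this lazily as the window advances.
adjusted_view = raw.copy()
adjusted_view[:2] /= split_ratio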
In [14]:
# A length-5 window first becomes available on the 5th date, so align the
# date iterator with the windows produced by traverse().
dates_iter = iter(dates[4:])
window = closes.traverse(5)
window
Out[14]:
In [15]:
# This cell is run multiple times to show the numbers scrolling up until we hit the split.
data = next(window)
print data
print next(dates_iter)
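The same demo in loop form, relying only on next() as above: keep pulling windows until the AdjustedArray is exhausted, labeling each one with its end date.
while True:
    try:
        data = next(window)
    except StopIteration:
        break
    print next(dates_iter)
    print data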