In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bisect
%matplotlib notebook
import tensorflow

In [71]:
# Bar width used when resampling trades and quotes.
INTERVAL = '10s'
# Default horizon for add_fcst: 30 bars of INTERVAL.
HORIZON = pd.Timedelta(INTERVAL)*30  # forecast horizon
# Input file template; `type` is 'trades' or 'quotes' and `date` is the day
# string (filled in by read_trades / read_quotes).
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
DATA_PATH = '/Users/felipe/bitcoin/{type}/{date}.csv.gz'
# Output file template for the per-day training CSV; `date` is filled in on save.
OUTPUT = '/Users/felipe/bitcoin/data/{date}-training.csv'
SPANS = [2, 3, 6, 15, 30, 60]  # interval units

In [60]:
# Day to process; substituted for `{date}` in the DATA_PATH and OUTPUT templates.
FOR_DATE = '20190515'

In [66]:
def read_trades(date):
    """Load one day of XBTUSD trades.

    Reads the CSV that ``DATA_PATH`` resolves to for ``type='trades'`` and
    ``str(date)``, using the ``timestamp`` column (parsed as dates) as the
    index and renaming that index to ``time``.

    Only rows with ``symbol == 'XBTUSD'`` and ``price > 1`` are kept, and any
    row with a missing value is dropped.

    Returns a DataFrame with the columns ``price``, ``side`` and ``size``.
    """
    source = DATA_PATH.format(type='trades', date=str(date))
    raw = pd.read_csv(
        source,
        usecols=['timestamp', 'symbol', 'price', 'side', 'size'],
        index_col='timestamp',
        parse_dates=True,
        infer_datetime_format=True,
    )
    raw.index.name = 'time'

    is_xbtusd = raw['symbol'] == 'XBTUSD'
    has_real_price = raw['price'] > 1
    kept = raw[is_xbtusd & has_real_price].dropna()

    # Selecting the output columns also discards the 'symbol' helper column.
    return kept[['price', 'side', 'size']]

def read_quotes(date):
    """Load one day of XBTUSD top-of-book quotes.

    Reads the CSV that ``DATA_PATH`` resolves to for ``type='quotes'`` and
    ``str(date)``, using the ``timestamp`` column (parsed as dates) as the
    index and renaming that index to ``time``.

    Only rows with ``symbol == 'XBTUSD'``, both prices above 1 and
    ``bidPrice < askPrice`` are kept, and any row with a missing value is
    dropped.

    Returns a DataFrame with the columns ``bidPrice``, ``askPrice``,
    ``bidSize`` and ``askSize``.
    """
    book_columns = ['bidPrice', 'askPrice', 'bidSize', 'askSize']
    source = DATA_PATH.format(type='quotes', date=str(date))
    raw = pd.read_csv(
        source,
        usecols=['timestamp', 'symbol', 'bidPrice', 'askPrice', 'bidSize', 'askSize'],
        index_col='timestamp',
        parse_dates=True,
        infer_datetime_format=True,
    )
    raw.index.name = 'time'

    is_xbtusd = raw['symbol'] == 'XBTUSD'
    has_real_prices = (raw['bidPrice'] > 1) & (raw['askPrice'] > 1)
    not_crossed = raw['bidPrice'] < raw['askPrice']
    kept = raw[is_xbtusd & has_real_prices & not_crossed].dropna()

    # Selecting the output columns also discards the 'symbol' helper column.
    return kept[book_columns]

In [67]:
def ema(df, spans, columns):
    """Append exponentially weighted means of ``columns`` to ``df``.

    For every ``span`` in ``spans`` and every column ``c`` in ``columns`` the
    result gains a column named ``E{span}{c}`` holding
    ``df[c].ewm(span=span).mean()``.

    Returns a new DataFrame: the original columns first, followed by the
    derived columns grouped by span (in the order given). ``df`` itself is
    not modified.
    """
    smoothed = [
        df[columns]
        .ewm(span=width)
        .mean()
        .rename(columns=lambda name, width=width: f'E{width}{name}')
        for width in spans
    ]
    return pd.concat([df] + smoothed, axis=1)

In [68]:
# read_trades returns the filtered XBTUSD trades with columns
# 'price', 'side', 'size', indexed by 'time'.
t = read_trades(FOR_DATE)

In [ ]:
# Split traded size by side and total it per INTERVAL bar.
#
# The size is signed first (negative for side == 'Sell', unchanged for every
# other side) and then clipped, which reproduces the original sign-flip logic
# without Series.clip_lower / clip_upper (removed in pandas 1.0) and without
# in-place edits on a column selection (chained assignment).
is_sell = t['side'] == 'Sell'
signed_size = t['size'].where(~is_sell, -t['size'])

# Built from raw arrays on the original index so repeated timestamps are kept
# row-for-row and never go through index alignment.
t = pd.DataFrame(
    {
        'boughtSum': signed_size.clip(lower=0).values,
        'soldSum': (-signed_size.clip(upper=0)).values,
    },
    index=t.index,
    columns=['boughtSum', 'soldSum'],
)

# sum() already yields 0 for bars without trades, so no forward-fill is needed.
t = t.resample(INTERVAL).sum()

In [ ]:
t.head()

In [ ]:
len(t)

In [ ]:
# Load the day's quotes with read_quotes, the loader defined in this notebook.
# It already returns exactly 'bidPrice', 'askPrice', 'bidSize', 'askSize' in
# that order, so no further column selection is needed.
# (The previous call used read_timeseries / QUOTES_PATH, which are not defined
# anywhere in this notebook and fail on a fresh kernel.)
q = read_quotes(FOR_DATE)

In [ ]:
def add_fcst(q, horizon=None):
    """Add forward-looking ``longPnl`` / ``shortPnl`` columns to ``q`` in place.

    For each row at time ``t`` the look-ahead window is ``[t, t + horizon)``
    (the row itself is included):

    * ``longPnl``  = highest ``bidPrice`` in the window - ``askPrice`` at ``t``
    * ``shortPnl`` = ``bidPrice`` at ``t`` - lowest ``askPrice`` in the window

    Parameters
    ----------
    q : DataFrame
        Must have ``bidPrice`` and ``askPrice`` columns and an ascending
        datetime index. It is modified in place.
    horizon : Timedelta, offset or offset string, optional
        Length of the look-ahead window. ``None`` (the default) uses the
        module-level ``HORIZON``, looked up at call time so that changing the
        config cell does not require re-running this definition.

    Returns
    -------
    DataFrame
        The same object that was passed in as ``q``.
    """
    if horizon is None:
        horizon = HORIZON

    def _forward_extreme(series, how):
        """Return ``how`` ('max' / 'min') of ``series`` over [t, t + horizon) as an array."""
        if len(series) == 0:
            return series.values
        idx = series.index
        # An offset-based rolling window is trailing: it only covers the past.
        # Mirroring the time axis (and reversing the values) turns that trailing
        # window into a look-ahead window on the original axis.
        mirrored_index = idx[0] + (idx[-1] - idx[::-1])
        mirrored = pd.Series(series.values[::-1], index=mirrored_index)
        rolled = getattr(mirrored.rolling(horizon), how)()
        return rolled.values[::-1]

    # q may be a slice of another frame; silence the chained-assignment
    # warning only while the two columns are written.
    with pd.option_context('mode.chained_assignment', None):
        bid_max = _forward_extreme(q['bidPrice'], 'max')
        ask_min = _forward_extreme(q['askPrice'], 'min')
        # Plain arrays are assigned so repeated timestamps never trigger
        # index alignment.
        q['longPnl'] = bid_max - q['askPrice'].values
        q['shortPnl'] = q['bidPrice'].values - ask_min
    return q

# add_fcst writes the longPnl / shortPnl columns into q itself and returns q,
# so the result does not need to be reassigned.
add_fcst(q)

In [ ]:


In [ ]:
# Bid/ask spread expressed in ticks; 0.5 is the tick size this notebook assumes.
q['spread'] = (q['askPrice'] - q['bidPrice']) / 0.5

# Average every column per INTERVAL bar. A bar without quotes has no mean, so
# it inherits the previous bar's averages. Every column then gets an 'Avg'
# suffix. `.ffill()` replaces the deprecated `fillna(method='ffill')`.
q = q.resample(INTERVAL).mean().ffill().add_suffix('Avg')

In [ ]:
q.head()

In [ ]:
len(q)

In [ ]:
# Y columns first, then the X columns that are later passed to ema().
Y_cols = ['longPnlAvg', 'shortPnlAvg']
X_cols = [
    'boughtSum', 'soldSum',
    'bidPriceAvg', 'askPriceAvg', 'bidSizeAvg', 'askSizeAvg',
    'spreadAvg',
]

# Join the trade and quote bars side by side on their time index and keep
# only the selected columns, in Y-then-X order.
df = pd.concat([t, q], axis=1)[Y_cols + X_cols]

In [ ]:
len(df)

In [ ]:
# Reuse the SPANS list from the configuration cell instead of repeating the
# same values here, so the spans are tuned in one place only.
spans = SPANS

In [ ]:
# Append an E{span}{column} exponentially weighted mean for every span and
# every X column.
# NOTE: df is rebuilt from itself, so running this cell twice appends the E*
# columns a second time (duplicate column names).
df = ema(df, spans, X_cols)

In [ ]:
df.head()

In [72]:
# Write the assembled frame to the training CSV for FOR_DATE.
training_csv = OUTPUT.format(date=FOR_DATE)
df.to_csv(training_csv)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-72-00d1f155941f> in <module>()
----> 1 df.to_csv(OUTPUT.format(date=FOR_DATE))

NameError: name 'df' is not defined

In [ ]:
len(df)

In [1]:
import pandas as pd

In [6]:
%pinfo pd.DataFrame.rolling


Signature: pd.DataFrame.rolling(self, window, min_periods=None, freq=None, center=False, win_type=None, on=None, axis=0, closed=None)
Docstring:
Provides rolling window calculations.

.. versionadded:: 0.18.0

Parameters
----------
window : int, or offset
    Size of the moving window. This is the number of observations used for
    calculating the statistic. Each window will be a fixed size.

    If its an offset then this will be the time period of each window. Each
    window will be a variable sized based on the observations included in
    the time-period. This is only valid for datetimelike indexes. This is
    new in 0.19.0
min_periods : int, default None
    Minimum number of observations in window required to have a value
    (otherwise result is NA). For a window that is specified by an offset,
    this will default to 1.
freq : string or DateOffset object, optional (default None)
    .. deprecated:: 0.18.0
       Frequency to conform the data to before computing the statistic.
       Specified as a frequency string or DateOffset object.
center : boolean, default False
    Set the labels at the center of the window.
win_type : string, default None
    Provide a window type. See the notes below.
on : string, optional
    For a DataFrame, column on which to calculate
    the rolling window, rather than the index
closed : string, default None
    Make the interval closed on the 'right', 'left', 'both' or
    'neither' endpoints.
    For offset-based windows, it defaults to 'right'.
    For fixed windows, defaults to 'both'. Remaining cases not implemented
    for fixed windows.

    .. versionadded:: 0.20.0

axis : int or string, default 0

Returns
-------
a Window or Rolling sub-classed for the particular operation

Examples
--------

>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
>>> df
     B
0  0.0
1  1.0
2  2.0
3  NaN
4  4.0

Rolling sum with a window length of 2, using the 'triang'
window type.

>>> df.rolling(2, win_type='triang').sum()
     B
0  NaN
1  1.0
2  2.5
3  NaN
4  NaN

Rolling sum with a window length of 2, min_periods defaults
to the window length.

>>> df.rolling(2).sum()
     B
0  NaN
1  1.0
2  3.0
3  NaN
4  NaN

Same as above, but explicity set the min_periods

>>> df.rolling(2, min_periods=1).sum()
     B
0  0.0
1  1.0
2  3.0
3  2.0
4  4.0

A ragged (meaning not-a-regular frequency), time-indexed DataFrame

>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
....:                 index = [pd.Timestamp('20130101 09:00:00'),
....:                          pd.Timestamp('20130101 09:00:02'),
....:                          pd.Timestamp('20130101 09:00:03'),
....:                          pd.Timestamp('20130101 09:00:05'),
....:                          pd.Timestamp('20130101 09:00:06')])

>>> df
                       B
2013-01-01 09:00:00  0.0
2013-01-01 09:00:02  1.0
2013-01-01 09:00:03  2.0
2013-01-01 09:00:05  NaN
2013-01-01 09:00:06  4.0


Contrasting to an integer rolling window, this will roll a variable
length window corresponding to the time period.
The default for min_periods is 1.

>>> df.rolling('2s').sum()
                       B
2013-01-01 09:00:00  0.0
2013-01-01 09:00:02  1.0
2013-01-01 09:00:03  3.0
2013-01-01 09:00:05  NaN
2013-01-01 09:00:06  4.0

Notes
-----
By default, the result is set to the right edge of the window. This can be
changed to the center of the window by setting ``center=True``.

The `freq` keyword is used to conform time series data to a specified
frequency by resampling the data. This is done with the default parameters
of :meth:`~pandas.Series.resample` (i.e. using the `mean`).

To learn more about the offsets & frequency strings, please see `this link
<http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.

The recognized win_types are:

* ``boxcar``
* ``triang``
* ``blackman``
* ``hamming``
* ``bartlett``
* ``parzen``
* ``bohman``
* ``blackmanharris``
* ``nuttall``
* ``barthann``
* ``kaiser`` (needs beta)
* ``gaussian`` (needs std)
* ``general_gaussian`` (needs power, width)
* ``slepian`` (needs width).

If ``win_type=None`` all points are evenly weighted. To learn more about
different window types see `scipy.signal window functions
<https://docs.scipy.org/doc/scipy/reference/signal.html#window-functions>`__.
File:      ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py
Type:      function

In [ ]: