Import necessary modules


In [1]:
import numpy as np
import pandas as pd
import pprint

Filepath management


In [2]:
project_dir = r'/Users/hudson/Code/marketModel/'

Load tagged data


In [3]:
stock_data = pd.read_hdf(project_dir + 'data/stock_data/tagged_stock_data.hdf', 'table')
stock_data.head()


Out[3]:
ticker open high low close volume drop_goal_met goal_met raise_goal_met statusMessage
timestamp
2017-08-09 13:30:00 APHB 0.8001 0.82 0.8001 0.82 3138.0 False False False
2017-08-09 13:31:00 APHB 0.8200 0.82 0.8200 0.82 0.0 False False False
2017-08-09 13:32:00 APHB 0.8200 0.82 0.8200 0.82 0.0 False False False
2017-08-09 13:33:00 APHB 0.8200 0.82 0.8200 0.82 0.0 False False False
2017-08-09 13:34:00 APHB 0.8200 0.82 0.8200 0.82 0.0 False False False

Reshape the stock data

Keras expects as input a numpy array of the following shape (number_sequences, number_timesteps, number_features)

First, filter so all tickers have the same number of minutes


In [4]:
minmax_times = stock_data.reset_index().groupby('ticker').agg({'timestamp': (np.min, np.max)}).timestamp
minmax_times


Out[4]:
amin amax
ticker
APHB 2017-08-09 13:30:00 2017-09-06 15:54:00
ARDM 2017-08-09 13:30:00 2017-09-06 15:52:00
ASTC 2017-08-09 13:30:00 2017-09-06 15:50:00
ATLC 2017-08-09 13:30:00 2017-09-06 14:52:00
AVIR 2017-08-09 13:30:00 2017-09-06 15:50:00
BNSO 2017-08-09 14:05:00 2017-09-05 16:00:00
CODA 2017-08-09 14:30:00 2017-09-06 15:07:00
CPST 2017-08-09 13:30:00 2017-09-06 15:57:00
CYAN 2017-08-10 13:30:00 2017-09-06 14:33:00
DTRM 2017-08-09 13:30:00 2017-09-06 15:57:00
FORD 2017-08-09 13:30:00 2017-09-06 15:40:00
GBR 2017-08-09 13:38:00 2017-09-06 15:58:00
JCS 2017-08-09 13:30:00 2017-09-06 15:53:00
JVA 2017-08-09 14:04:00 2017-09-06 14:46:00
LEU 2017-08-09 13:30:00 2017-09-06 15:42:00
LRAD 2017-08-09 13:30:00 2017-09-06 15:07:00
MICR 2017-08-09 14:05:00 2017-09-06 15:34:00
MOC 2017-08-09 14:27:00 2017-09-06 15:34:00
PAVM 2017-08-09 15:49:00 2017-09-06 15:52:00
REFR 2017-08-09 13:30:00 2017-09-06 15:57:00
SEAC 2017-08-09 13:45:00 2017-09-06 15:58:00
UTSI 2017-08-09 13:45:00 2017-09-06 15:50:00
VSR 2017-08-09 13:30:00 2017-09-06 15:56:00
WSTL 2017-08-09 13:30:00 2017-09-06 15:19:00
WTT 2017-08-09 14:14:00 2017-09-06 15:57:00
ZDGE 2017-08-09 13:30:00 2017-09-06 14:37:00

In [5]:
min_time, max_time = minmax_times.agg({'amin': np.max, 'amax': np.min})
print min_time, max_time


2017-08-10 13:30:00 2017-09-05 16:00:00

In [6]:
unindexed_stock_data = stock_data.reset_index()
unindexed_stock_data = unindexed_stock_data.loc[(min_time <= unindexed_stock_data.timestamp) & (unindexed_stock_data.timestamp <= max_time)].set_index('timestamp')
print "Stock data shape: " + str(stock_data.shape)
print "Filtered data shape: " + str(unindexed_stock_data.shape)


Stock data shape: (286722, 10)
Filtered data shape: (268242, 10)

In [7]:
minmax_times = unindexed_stock_data.reset_index().groupby('ticker').agg({'timestamp': (np.min, np.max)}).timestamp
minmax_times


Out[7]:
amin amax
ticker
APHB 2017-08-10 13:30:00 2017-09-05 16:00:00
ARDM 2017-08-10 13:30:00 2017-09-05 16:00:00
ASTC 2017-08-10 13:30:00 2017-09-05 16:00:00
ATLC 2017-08-10 13:30:00 2017-09-05 16:00:00
AVIR 2017-08-10 13:30:00 2017-09-05 16:00:00
BNSO 2017-08-10 13:30:00 2017-09-05 16:00:00
CODA 2017-08-10 13:30:00 2017-09-05 16:00:00
CPST 2017-08-10 13:30:00 2017-09-05 16:00:00
CYAN 2017-08-10 13:30:00 2017-09-05 16:00:00
DTRM 2017-08-10 13:30:00 2017-09-05 16:00:00
FORD 2017-08-10 13:30:00 2017-09-05 16:00:00
GBR 2017-08-10 13:30:00 2017-09-05 16:00:00
JCS 2017-08-10 13:30:00 2017-09-05 16:00:00
JVA 2017-08-10 13:30:00 2017-09-05 16:00:00
LEU 2017-08-10 13:30:00 2017-09-05 16:00:00
LRAD 2017-08-10 13:30:00 2017-09-05 16:00:00
MICR 2017-08-10 13:30:00 2017-09-05 16:00:00
MOC 2017-08-10 13:30:00 2017-09-05 16:00:00
PAVM 2017-08-10 13:30:00 2017-09-05 16:00:00
REFR 2017-08-10 13:30:00 2017-09-05 16:00:00
SEAC 2017-08-10 13:30:00 2017-09-05 16:00:00
UTSI 2017-08-10 13:30:00 2017-09-05 16:00:00
VSR 2017-08-10 13:30:00 2017-09-05 16:00:00
WSTL 2017-08-10 13:30:00 2017-09-05 16:00:00
WTT 2017-08-10 13:30:00 2017-09-05 16:00:00
ZDGE 2017-08-10 13:30:00 2017-09-05 16:00:00

In [8]:
stock_data = unindexed_stock_data

Now start reshaping!


In [9]:
# They should all have the same number of rows and columns now. 
# Rows are minutes and columns are stock prices and tagging info
num_minutes = []
for name, group in stock_data.groupby('ticker'):
    print name, group.shape
    num_minutes.append(group.shape[0])

num_minutes = int(np.unique(num_minutes))
print num_minutes


APHB (10317, 10)
ARDM (10317, 10)
ASTC (10317, 10)
ATLC (10317, 10)
AVIR (10317, 10)
BNSO (10317, 10)
CODA (10317, 10)
CPST (10317, 10)
CYAN (10317, 10)
DTRM (10317, 10)
FORD (10317, 10)
GBR (10317, 10)
JCS (10317, 10)
JVA (10317, 10)
LEU (10317, 10)
LRAD (10317, 10)
MICR (10317, 10)
MOC (10317, 10)
PAVM (10317, 10)
REFR (10317, 10)
SEAC (10317, 10)
UTSI (10317, 10)
VSR (10317, 10)
WSTL (10317, 10)
WTT (10317, 10)
ZDGE (10317, 10)
10317

In [10]:
# Looks good. Let's now make sure that the stock data is sorted by ticker and timestampe
stock_data_sorted = stock_data.reset_index().sort_values(['ticker', 'timestamp']).set_index('timestamp')

X = stock_data_sorted.loc[:,'ticker':'volume']
Y = stock_data_sorted.loc[:,('ticker','goal_met')]

In [11]:
print X.shape[0]/26
print Y.shape


10317
(268242, 2)

In [12]:
def rolling_window(a, step):
    shape   = a.shape[:-1] + (a.shape[-1] - step + 1, step)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

In [13]:
window_size = 360
num_features = 5

# Shape the X array
X_shaped_array = np.zeros((X.shape[0]/X.ticker.unique().size - window_size + 1, 
                         window_size, 
                         num_features *  X.ticker.unique().size))
                         
for idx, (name, group) in enumerate(X.groupby('ticker')):
    group_mx = group.loc[:,'open':'volume'].transpose().as_matrix()
    X_shaped_array[:,:,num_features*idx:(num_features * (idx+1))] = \
                          np.rollaxis(rolling_window(group_mx, window_size), 0, 3)

In [14]:
print X_shaped_array.shape
print X_shaped_array[0, 0:3, 0:5]
print X.head(3)


(9958, 360, 130)
[[  8.00000000e-01   8.00000000e-01   8.00000000e-01   8.00000000e-01
    1.59000000e+03]
 [  8.00000000e-01   8.00000000e-01   8.00000000e-01   8.00000000e-01
    0.00000000e+00]
 [  7.98000000e-01   8.00000000e-01   7.98000000e-01   8.00000000e-01
    1.00000000e+04]]
                    ticker   open  high    low  close   volume
timestamp                                                     
2017-08-10 13:30:00   APHB  0.800   0.8  0.800    0.8   1590.0
2017-08-10 13:31:00   APHB  0.800   0.8  0.800    0.8      0.0
2017-08-10 13:32:00   APHB  0.798   0.8  0.798    0.8  10000.0

In [15]:
# Shape the Y array
Y_shaped_array = np.zeros((X.shape[0]/X.ticker.unique().size - window_size + 1, X.ticker.unique().size))

for idx, (name, group) in enumerate(Y.groupby('ticker')):
    group_mx = group.loc[:,'goal_met'].as_matrix()
    Y_shaped_array[:,idx] = group_mx[window_size-1:]

In [16]:
print Y_shaped_array.shape
print (Y_shaped_array==1).sum()
print Y.groupby('ticker').agg({'goal_met': lambda x: x.iloc[window_size-1:].sum()}).agg(np.sum)


(9958, 26)
10389
goal_met    10389
dtype: int64

In [17]:
9317./10218./26.


Out[17]:
0.03507008747760362

Save the numpy arrays


In [18]:
np.savez(project_dir + 'data/trainingData.npz', X=X_shaped_array, Y=Y_shaped_array)