Import necessary modules



In [1]:

    
import numpy as np
import pandas as pd
import pprint

Filepath management



In [2]:

    
project_dir = r'/Users/hudson/Code/marketModel/'

Load tagged data



In [3]:

    
stock_data = pd.read_hdf(project_dir + 'data/stock_data/tagged_stock_data.hdf', 'table')
stock_data.head()









    Out[3]:







  
    
      
      ticker
      open
      high
      low
      close
      volume
      drop_goal_met
      goal_met
      raise_goal_met
      statusMessage
    
    
      timestamp
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2017-08-09 13:30:00
      APHB
      0.8001
      0.82
      0.8001
      0.82
      3138.0
      False
      False
      False
      
    
    
      2017-08-09 13:31:00
      APHB
      0.8200
      0.82
      0.8200
      0.82
      0.0
      False
      False
      False
      
    
    
      2017-08-09 13:32:00
      APHB
      0.8200
      0.82
      0.8200
      0.82
      0.0
      False
      False
      False
      
    
    
      2017-08-09 13:33:00
      APHB
      0.8200
      0.82
      0.8200
      0.82
      0.0
      False
      False
      False
      
    
    
      2017-08-09 13:34:00
      APHB
      0.8200
      0.82
      0.8200
      0.82
      0.0
      False
      False
      False

Reshape the stock data

Keras expects as input a numpy array of the following shape (number_sequences, number_timesteps, number_features)

First, filter so all tickers have the same number of minutes



In [4]:

    
minmax_times = stock_data.reset_index().groupby('ticker').agg({'timestamp': (np.min, np.max)}).timestamp
minmax_times









    Out[4]:







  
    
      
      amin
      amax
    
    
      ticker
      
      
    
  
  
    
      APHB
      2017-08-09 13:30:00
      2017-09-06 15:54:00
    
    
      ARDM
      2017-08-09 13:30:00
      2017-09-06 15:52:00
    
    
      ASTC
      2017-08-09 13:30:00
      2017-09-06 15:50:00
    
    
      ATLC
      2017-08-09 13:30:00
      2017-09-06 14:52:00
    
    
      AVIR
      2017-08-09 13:30:00
      2017-09-06 15:50:00
    
    
      BNSO
      2017-08-09 14:05:00
      2017-09-05 16:00:00
    
    
      CODA
      2017-08-09 14:30:00
      2017-09-06 15:07:00
    
    
      CPST
      2017-08-09 13:30:00
      2017-09-06 15:57:00
    
    
      CYAN
      2017-08-10 13:30:00
      2017-09-06 14:33:00
    
    
      DTRM
      2017-08-09 13:30:00
      2017-09-06 15:57:00
    
    
      FORD
      2017-08-09 13:30:00
      2017-09-06 15:40:00
    
    
      GBR
      2017-08-09 13:38:00
      2017-09-06 15:58:00
    
    
      JCS
      2017-08-09 13:30:00
      2017-09-06 15:53:00
    
    
      JVA
      2017-08-09 14:04:00
      2017-09-06 14:46:00
    
    
      LEU
      2017-08-09 13:30:00
      2017-09-06 15:42:00
    
    
      LRAD
      2017-08-09 13:30:00
      2017-09-06 15:07:00
    
    
      MICR
      2017-08-09 14:05:00
      2017-09-06 15:34:00
    
    
      MOC
      2017-08-09 14:27:00
      2017-09-06 15:34:00
    
    
      PAVM
      2017-08-09 15:49:00
      2017-09-06 15:52:00
    
    
      REFR
      2017-08-09 13:30:00
      2017-09-06 15:57:00
    
    
      SEAC
      2017-08-09 13:45:00
      2017-09-06 15:58:00
    
    
      UTSI
      2017-08-09 13:45:00
      2017-09-06 15:50:00
    
    
      VSR
      2017-08-09 13:30:00
      2017-09-06 15:56:00
    
    
      WSTL
      2017-08-09 13:30:00
      2017-09-06 15:19:00
    
    
      WTT
      2017-08-09 14:14:00
      2017-09-06 15:57:00
    
    
      ZDGE
      2017-08-09 13:30:00
      2017-09-06 14:37:00



In [5]:

    
min_time, max_time = minmax_times.agg({'amin': np.max, 'amax': np.min})
print min_time, max_time









    



2017-08-10 13:30:00 2017-09-05 16:00:00



In [6]:

    
unindexed_stock_data = stock_data.reset_index()
unindexed_stock_data = unindexed_stock_data.loc[(min_time <= unindexed_stock_data.timestamp) & (unindexed_stock_data.timestamp <= max_time)].set_index('timestamp')
print "Stock data shape: " + str(stock_data.shape)
print "Filtered data shape: " + str(unindexed_stock_data.shape)









    



Stock data shape: (286722, 10)
Filtered data shape: (268242, 10)



In [7]:

    
minmax_times = unindexed_stock_data.reset_index().groupby('ticker').agg({'timestamp': (np.min, np.max)}).timestamp
minmax_times









    Out[7]:







  
    
      
      amin
      amax
    
    
      ticker
      
      
    
  
  
    
      APHB
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      ARDM
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      ASTC
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      ATLC
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      AVIR
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      BNSO
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      CODA
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      CPST
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      CYAN
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      DTRM
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      FORD
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      GBR
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      JCS
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      JVA
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      LEU
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      LRAD
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      MICR
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      MOC
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      PAVM
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      REFR
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      SEAC
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      UTSI
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      VSR
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      WSTL
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      WTT
      2017-08-10 13:30:00
      2017-09-05 16:00:00
    
    
      ZDGE
      2017-08-10 13:30:00
      2017-09-05 16:00:00



In [8]:

    
stock_data = unindexed_stock_data

Now start reshaping!



In [9]:

    
# They should all have the same number of rows and columns now. 
# Rows are minutes and columns are stock prices and tagging info
num_minutes = []
for name, group in stock_data.groupby('ticker'):
    print name, group.shape
    num_minutes.append(group.shape[0])

num_minutes = int(np.unique(num_minutes))
print num_minutes









    



APHB (10317, 10)
ARDM (10317, 10)
ASTC (10317, 10)
ATLC (10317, 10)
AVIR (10317, 10)
BNSO (10317, 10)
CODA (10317, 10)
CPST (10317, 10)
CYAN (10317, 10)
DTRM (10317, 10)
FORD (10317, 10)
GBR (10317, 10)
JCS (10317, 10)
JVA (10317, 10)
LEU (10317, 10)
LRAD (10317, 10)
MICR (10317, 10)
MOC (10317, 10)
PAVM (10317, 10)
REFR (10317, 10)
SEAC (10317, 10)
UTSI (10317, 10)
VSR (10317, 10)
WSTL (10317, 10)
WTT (10317, 10)
ZDGE (10317, 10)
10317



In [10]:

    
# Looks good. Let's now make sure that the stock data is sorted by ticker and timestampe
stock_data_sorted = stock_data.reset_index().sort_values(['ticker', 'timestamp']).set_index('timestamp')

X = stock_data_sorted.loc[:,'ticker':'volume']
Y = stock_data_sorted.loc[:,('ticker','goal_met')]



In [11]:

    
print X.shape[0]/26
print Y.shape









    



10317
(268242, 2)



In [12]:

    
def rolling_window(a, step):
    shape   = a.shape[:-1] + (a.shape[-1] - step + 1, step)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)



In [13]:

    
window_size = 360
num_features = 5

# Shape the X array
X_shaped_array = np.zeros((X.shape[0]/X.ticker.unique().size - window_size + 1, 
                         window_size, 
                         num_features *  X.ticker.unique().size))
                         
for idx, (name, group) in enumerate(X.groupby('ticker')):
    group_mx = group.loc[:,'open':'volume'].transpose().as_matrix()
    X_shaped_array[:,:,num_features*idx:(num_features * (idx+1))] = \
                          np.rollaxis(rolling_window(group_mx, window_size), 0, 3)



In [14]:

    
print X_shaped_array.shape
print X_shaped_array[0, 0:3, 0:5]
print X.head(3)









    



(9958, 360, 130)
[[  8.00000000e-01   8.00000000e-01   8.00000000e-01   8.00000000e-01
    1.59000000e+03]
 [  8.00000000e-01   8.00000000e-01   8.00000000e-01   8.00000000e-01
    0.00000000e+00]
 [  7.98000000e-01   8.00000000e-01   7.98000000e-01   8.00000000e-01
    1.00000000e+04]]
                    ticker   open  high    low  close   volume
timestamp                                                     
2017-08-10 13:30:00   APHB  0.800   0.8  0.800    0.8   1590.0
2017-08-10 13:31:00   APHB  0.800   0.8  0.800    0.8      0.0
2017-08-10 13:32:00   APHB  0.798   0.8  0.798    0.8  10000.0



In [15]:

    
# Shape the Y array
Y_shaped_array = np.zeros((X.shape[0]/X.ticker.unique().size - window_size + 1, X.ticker.unique().size))

for idx, (name, group) in enumerate(Y.groupby('ticker')):
    group_mx = group.loc[:,'goal_met'].as_matrix()
    Y_shaped_array[:,idx] = group_mx[window_size-1:]



In [16]:

    
print Y_shaped_array.shape
print (Y_shaped_array==1).sum()
print Y.groupby('ticker').agg({'goal_met': lambda x: x.iloc[window_size-1:].sum()}).agg(np.sum)









    



(9958, 26)
10389
goal_met    10389
dtype: int64



In [17]:

    
9317./10218./26.









    Out[17]:





0.03507008747760362

Save the numpy arrays



In [18]:

    
np.savez(project_dir + 'data/trainingData.npz', X=X_shaped_array, Y=Y_shaped_array)

	ticker	open	high	low	close	volume	drop_goal_met	goal_met	raise_goal_met
timestamp
2017-08-09 13:30:00	APHB	0.8001	0.82	0.8001	0.82	3138.0	False	False	False
2017-08-09 13:31:00	APHB	0.8200	0.82	0.8200	0.82	0.0	False	False	False
2017-08-09 13:32:00	APHB	0.8200	0.82	0.8200	0.82	0.0	False	False	False
2017-08-09 13:33:00	APHB	0.8200	0.82	0.8200	0.82	0.0	False	False	False
2017-08-09 13:34:00	APHB	0.8200	0.82	0.8200	0.82	0.0	False	False	False

	amin	amax
ticker
APHB	2017-08-09 13:30:00	2017-09-06 15:54:00
ARDM	2017-08-09 13:30:00	2017-09-06 15:52:00
ASTC	2017-08-09 13:30:00	2017-09-06 15:50:00
ATLC	2017-08-09 13:30:00	2017-09-06 14:52:00
AVIR	2017-08-09 13:30:00	2017-09-06 15:50:00
BNSO	2017-08-09 14:05:00	2017-09-05 16:00:00
CODA	2017-08-09 14:30:00	2017-09-06 15:07:00
CPST	2017-08-09 13:30:00	2017-09-06 15:57:00
CYAN	2017-08-10 13:30:00	2017-09-06 14:33:00
DTRM	2017-08-09 13:30:00	2017-09-06 15:57:00
FORD	2017-08-09 13:30:00	2017-09-06 15:40:00
GBR	2017-08-09 13:38:00	2017-09-06 15:58:00
JCS	2017-08-09 13:30:00	2017-09-06 15:53:00
JVA	2017-08-09 14:04:00	2017-09-06 14:46:00
LEU	2017-08-09 13:30:00	2017-09-06 15:42:00
LRAD	2017-08-09 13:30:00	2017-09-06 15:07:00
MICR	2017-08-09 14:05:00	2017-09-06 15:34:00
MOC	2017-08-09 14:27:00	2017-09-06 15:34:00
PAVM	2017-08-09 15:49:00	2017-09-06 15:52:00
REFR	2017-08-09 13:30:00	2017-09-06 15:57:00
SEAC	2017-08-09 13:45:00	2017-09-06 15:58:00
UTSI	2017-08-09 13:45:00	2017-09-06 15:50:00
VSR	2017-08-09 13:30:00	2017-09-06 15:56:00
WSTL	2017-08-09 13:30:00	2017-09-06 15:19:00
WTT	2017-08-09 14:14:00	2017-09-06 15:57:00
ZDGE	2017-08-09 13:30:00	2017-09-06 14:37:00