In [1]:
import numpy as np
import pandas as pd
import pprint
In [2]:
# Root directory of the project. Other cells build file paths by plain string
# concatenation (project_dir + 'data/...'), so the trailing slash is required.
# NOTE(review): this is an absolute, machine-specific path - adjust it before
# running the notebook on another machine.
project_dir = r'/Users/hudson/Code/marketModel/'
In [3]:
# Read the tagged stock table (key 'table') from the project's HDF file and
# preview the first rows.
tagged_data_path = project_dir + 'data/stock_data/tagged_stock_data.hdf'
stock_data = pd.read_hdf(tagged_data_path, 'table')
stock_data.head()
Out[3]:
In [4]:
# First and last timestamp recorded for each ticker. Selecting 'timestamp'
# afterwards strips the outer column level, leaving one column per
# aggregation function.
ticker_groups = stock_data.reset_index().groupby('ticker')
minmax_times = ticker_groups.agg({'timestamp': (np.min, np.max)})['timestamp']
minmax_times
Out[4]:
In [5]:
# Time span shared by every ticker: it starts at the latest of the per-ticker
# first timestamps and ends at the earliest of the per-ticker last timestamps.
# Each bound is read from its column by name instead of unpacking an
# aggregated Series, because the element order of that Series follows dict
# ordering and a swap would silently exchange the two bounds.
min_time = minmax_times['amin'].max()
max_time = minmax_times['amax'].min()
if min_time > max_time:
    # Filtering on an inverted range would silently yield an empty frame.
    raise ValueError("Tickers share no common time range: %s > %s"
                     % (min_time, max_time))
print("%s %s" % (min_time, max_time))
In [6]:
# Keep only the rows whose timestamp lies inside [min_time, max_time] (both
# ends inclusive), then index the surviving rows by timestamp.
unindexed_stock_data = stock_data.reset_index()
row_times = unindexed_stock_data.timestamp
in_common_range = (min_time <= row_times) & (row_times <= max_time)
unindexed_stock_data = unindexed_stock_data.loc[in_common_range].set_index('timestamp')
print("Stock data shape: " + str(stock_data.shape))
print("Filtered data shape: " + str(unindexed_stock_data.shape))
In [7]:
# Recompute the per-ticker first/last timestamps on the filtered frame so the
# effect of the trimming can be inspected.
filtered_by_ticker = unindexed_stock_data.reset_index().groupby('ticker')
minmax_times = filtered_by_ticker.agg({'timestamp': (np.min, np.max)})['timestamp']
minmax_times
Out[7]:
In [8]:
# From here on `stock_data` is the frame restricted to the common time range
# and indexed by timestamp; the originally loaded frame is no longer reachable
# under this name.
stock_data = unindexed_stock_data
In [9]:
# They should all have the same number of rows and columns now.
# Rows are minutes and columns are stock prices and tagging info.
rows_per_ticker = []
for name, group in stock_data.groupby('ticker'):
    print("%s %s" % (name, group.shape))
    rows_per_ticker.append(group.shape[0])

# Fail with an explicit message if the tickers disagree, instead of letting
# int() choke on a multi-element array.
distinct_row_counts = np.unique(rows_per_ticker)
if distinct_row_counts.size != 1:
    raise ValueError("Tickers do not all have the same number of rows: %s"
                     % (distinct_row_counts.tolist(),))
num_minutes = int(distinct_row_counts[0])
print(num_minutes)
In [10]:
# Looks good. Let's now make sure that the stock data is sorted by ticker and timestamp
stock_data_sorted = (stock_data.reset_index()
                     .sort_values(['ticker', 'timestamp'])
                     .set_index('timestamp'))
# Features: label slice from 'ticker' through 'volume' (both ends included).
X = stock_data_sorted.loc[:, 'ticker':'volume']
# Labels: select the two columns with a list; a tuple selector is ambiguous
# because tuples are also how MultiIndex keys are spelled.
Y = stock_data_sorted.loc[:, ['ticker', 'goal_met']]
In [11]:
# Rows per ticker, derived from the data rather than a hard-coded ticker
# count; floor division keeps the result an integer.
print(X.shape[0] // X.ticker.unique().size)
print(Y.shape)
In [12]:
def rolling_window(a, step):
    """Return every length-`step` window along the last axis of `a`.

    The result is a strided view, not a copy: consecutive windows overlap in
    memory. It is therefore returned read-only, since writing to one element
    would silently change every window that shares it.

    Parameters
    ----------
    a : array_like
        Input with at least one dimension.
    step : int
        Window length; must satisfy 1 <= step <= a.shape[-1].

    Returns
    -------
    numpy.ndarray
        Read-only view of shape a.shape[:-1] + (a.shape[-1] - step + 1, step),
        where result[..., i, :] equals a[..., i:i + step].

    Raises
    ------
    ValueError
        If `a` is zero-dimensional or `step` is outside the valid range.
    """
    a = np.asarray(a)
    if a.ndim == 0:
        raise ValueError("rolling_window needs an array with at least one dimension")
    length = a.shape[-1]
    if not 1 <= step <= length:
        raise ValueError("step must be between 1 and %d, got %r" % (length, step))
    shape = a.shape[:-1] + (length - step + 1, step)
    # Moving to the next window and moving inside a window both advance by one
    # element along the original last axis, hence the repeated stride.
    strides = a.strides + (a.strides[-1],)
    windows = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
    windows.flags.writeable = False
    return windows
In [13]:
window_size = 360
num_features = 5

# Shape the X array: one entry per window position, `window_size` consecutive
# rows per entry, and `num_features` columns for each ticker side by side.
num_tickers = X.ticker.unique().size
# Floor division keeps the dimension an integer under true division as well.
num_windows = X.shape[0] // num_tickers - window_size + 1
X_shaped_array = np.zeros((num_windows, window_size, num_features * num_tickers))
for idx, (name, group) in enumerate(X.groupby('ticker')):
    # One row per feature column, one column per timestamp.
    # Assumes 'open':'volume' spans exactly `num_features` columns; otherwise
    # the assignment below fails with a shape mismatch.
    group_mx = group.loc[:, 'open':'volume'].transpose().values
    ticker_columns = slice(num_features * idx, num_features * (idx + 1))
    # rolling_window yields (feature, window, offset); move the feature axis
    # last to obtain (window, offset, feature).
    X_shaped_array[:, :, ticker_columns] = np.rollaxis(
        rolling_window(group_mx, window_size), 0, 3)
In [14]:
# Eyeball check: the first three rows of the first window, restricted to the
# first five feature columns, next to the first three rows of X.
print(X_shaped_array.shape)
leading_block = X_shaped_array[0, :3, :5]
print(leading_block)
print(X.head(3))
In [15]:
# Shape the Y array: one row per window position and one column per ticker,
# holding the `goal_met` value of the last row of each window.
num_tickers = Y.ticker.unique().size
# Floor division keeps the dimension an integer under true division as well.
num_windows = Y.shape[0] // num_tickers - window_size + 1
Y_shaped_array = np.zeros((num_windows, num_tickers))
for idx, (name, group) in enumerate(Y.groupby('ticker')):
    group_mx = group.loc[:, 'goal_met'].values
    # Drop the first window_size - 1 rows: no full window ends there.
    Y_shaped_array[:, idx] = group_mx[window_size - 1:]
In [16]:
# Cross-check: the number of entries equal to 1 in the shaped array, followed
# by the same quantity summed straight from the Y frame.
print(Y_shaped_array.shape)
positives_in_array = (Y_shaped_array == 1).sum()
print(positives_in_array)
positives_by_ticker = Y.groupby('ticker').agg(
    {'goal_met': lambda x: x.iloc[window_size-1:].sum()})
print(positives_by_ticker.agg(np.sum))
In [17]:
# NOTE(review): hard-coded figures - presumably (count of positive labels) /
# (number of windows) / (number of tickers), i.e. the overall fraction of
# positive labels; confirm against the values printed by the preceding cell.
9317./10218./26.
Out[17]:
In [18]:
# Write both shaped arrays into a single .npz archive under the keys 'X' and 'Y'.
training_data_path = project_dir + 'data/trainingData.npz'
np.savez(training_data_path, X=X_shaped_array, Y=Y_shaped_array)