In [1]:
# Setup (Imports)
from sklearn.utils import shuffle
from datetime import datetime, timedelta
import numpy as np
import os
import matplotlib.pyplot as plt
In [2]:
# Load CSV
def csv_as_numpy(stock):
    """
    Reads ../data/<stock>.csv into memory
    CSV -> (list of first-column strings, 2d array [samples, features])

    Lines that are 6 characters or shorter, or that contain "Date" or
    "null" anywhere, are skipped. Every field after the first one is
    converted to float.
    """
    csv_path = os.path.join('..', 'data', stock + '.csv')
    dates, rows = [], []
    with open(csv_path, 'r') as handle:
        for raw in handle:
            # Guard clause: drop blank-ish lines, the header and rows with missing values
            if len(raw) <= 6 or "Date" in raw or "null" in raw:
                continue
            fields = raw.split(",")
            dates.append(fields[0])
            rows.append(np.array([float(field) for field in fields[1:]]))
    return dates, np.array(rows)  # dates, 2d array
def headline_word2vec_csv_as_numpy(stock, emb_size=100, sentence_length=12):
    """
    Loads the headline vector csv file as a np array
    CSV -> 3d [samples, wordlen, wordvec]

    Each usable line is read as: characters 0-9 are the day, character 10
    is a separator, and the remainder is a Python literal holding one
    vector per word. Every sample is zero-padded or truncated to exactly
    `sentence_length` rows of `emb_size` values.

    stock: file name prefix; reads ../data/<stock>-headlines-vectors.csv
    emb_size: width of each word vector
    sentence_length: number of word vectors kept per sample

    Returns (days, values): list of day strings and the 3d array.
    A line holding an empty list yields an all-zero sample instead of
    raising IndexError.
    """
    days, day_values = [], []
    with open(os.path.join('..', 'data', stock + '-headlines-vectors.csv'), 'r') as data:
        for line in data:
            if len(line) <= 6:
                continue
            day, raw_vectors = line[:10], line[11:]
            # SECURITY(review): eval() executes whatever expression the file contains.
            # Only use with trusted data files; consider ast.literal_eval if the
            # file format is guaranteed to be plain literals.
            uneven_vector = np.array(eval(raw_vectors))
            even_vector = np.zeros((sentence_length, emb_size))
            # An empty headline evaluates to a 1-d array of size 0, which cannot be
            # sliced on two axes; leave the sample as zeros in that case.
            if uneven_vector.size:
                used = min(uneven_vector.shape[0], sentence_length)
                even_vector[:used, :] = uneven_vector[:used, :]
            days.append(day)
            day_values.append(even_vector)
    return days, np.array(day_values)
def headline_doc2vec_csv_as_numpy(stock):
    """
    Reads ../data/<stock>-headlines-vectors.csv into memory
    CSV -> (list of day strings, 2d array [samples, doc2vec])

    For every line longer than 6 characters, the first 10 characters are
    kept as the day and everything from character 11 onwards is evaluated
    as a Python expression and wrapped in a np array.
    """
    vectors_path = os.path.join('..', 'data', stock + '-headlines-vectors.csv')
    dates, vectors = [], []
    with open(vectors_path, 'r') as handle:
        for row in handle:
            if len(row) <= 6:
                continue
            # NOTE(review): eval() runs whatever expression the file holds - trusted files only
            parsed = np.array(eval(row[11:]))
            dates.append(row[:10])
            vectors.append(parsed)
    return dates, np.array(vectors)
In [3]:
# Make Data
def create_timeframed_close_regression_data(stock, window_size, window_skip=0, norm=False):
    """
    Timeframe Close Regression
    Creates data for a close-only based price regression from last `window_size` days

    stock: name passed to `csv_as_numpy`; column 3 of its array is used
    window_size: number of consecutive values preceding each target
    window_skip: number of most recent history values dropped from each input
    norm: when True, each frame is shifted and scaled by the mean and stddev
          of its history part only (the target is excluded from both
          statistics so it cannot leak into the inputs); the target is
          transformed with the same statistics

    Returns (X, Y): 2d inputs [samples, window_size - window_skip] and 1d targets.
    """
    data = csv_as_numpy(stock)[1][:, 3]
    X, Y = [], []
    for i in range(len(data) - window_size - 1):
        # window_size history values followed by the target value
        time_frame = np.copy(data[i: i + window_size + 1])
        if norm:
            history = time_frame[:-1]
            mean, std = np.mean(history), np.std(history)
            if std == 0:
                # Flat history: avoid dividing by zero, keep the frame centred only
                std = 1.0
            time_frame = (time_frame - mean) / std
        X.append(time_frame[:-1 - window_skip])
        Y.append(time_frame[-1])
    return np.array(X), np.array(Y)
def create_timeframed_alldata_classification_data(stock, window_size, norm=True, output='up/down'):
    """
    Timeframe Alldata Classification
    Creates data for prediction of stock up/down from all stock features for given `window_size`

    stock: name passed to `csv_as_numpy`
    window_size: number of consecutive rows preceding each labelled row
    norm: when True, each frame is standardised per column using the mean
          and stddev of its history rows (columns with zero stddev are
          only centred, not scaled)
    output: 'up/down' -> one-hot labels [1., 0.] (up) / [0., 1.] (not up)
            '+-1'     -> scalar labels +1. (up) / -1. (not up)

    Returns (X, Y): 3d inputs [samples, window_size, 5] and the labels.
    Raises ValueError for an unknown `output`, which previously produced
    inputs without matching labels.
    """
    if output not in ('up/down', '+-1'):
        raise ValueError("output must be 'up/down' or '+-1', got %r" % (output,))
    data = csv_as_numpy(stock)[1][:, (0,1,2,4,5)] # OPEN HIGH LOW close ADJ_CLOSE VOLUME
    X, Y = [], []
    for i in range(1, len(data) - window_size - 1):
        time_frame = np.copy(data[i: i + window_size + 1])
        # Column 3 of the selected subset is ADJ_CLOSE; compare before normalising
        current_close = time_frame[-1, 3]
        last_close = time_frame[-2, 3]
        went_up = last_close < current_close
        if norm:
            history = time_frame[:-1]
            mean = np.mean(history, axis=0)
            std = np.std(history, axis=0)
            std[std == 0] = 1.0  # constant column: avoid inf/nan from division by zero
            time_frame = (time_frame - mean) / std
        X.append(time_frame[:-1])
        if output == 'up/down':
            Y.append([1., 0.] if went_up else [0., 1.])
        else:
            Y.append(+1. if went_up else -1.)
    return np.array(X), np.array(Y)
def create_timeframed_word2vec_classification_data(stock, window_size):
    """
    Timeframe Word2Vec Classification
    Creates data for prediction of stock up/down from the last `window_size` headline samples

    For every headline window, the day of the headline entry right after
    the window is looked up in the stock data. The label compares the
    value on that day with the value on the following stock row
    ([1., 0.] = up, [0., 1.] = not up). Windows whose day is missing from
    the stock data, or which has no following row, are skipped.

    stock: name passed to `csv_as_numpy` and `headline_word2vec_csv_as_numpy`
    window_size: number of consecutive headline samples per input

    Returns (X, Y): inputs [samples, window_size, ...] and one-hot labels.
    """
    days1, histstock_data = csv_as_numpy(stock)
    days2, headlines_data = headline_word2vec_csv_as_numpy(stock)
    histstock_data = histstock_data[:, 4] # ADJ_CLOSE (column 3 is close in this file's column legend)
    # One-off day -> row lookup instead of scanning days1 for every window.
    # setdefault keeps the first occurrence, matching list.index().
    index_by_day = {}
    for position, day in enumerate(days1):
        index_by_day.setdefault(day, position)
    X, Y = [], []
    for i in range(1, len(headlines_data) - window_size - 1):
        window_end_date = days2[i + window_size]
        histstock_index = index_by_day.get(window_end_date)
        if histstock_index is None:
            continue  # no stock row for that day
        if histstock_index + 1 >= len(histstock_data):
            continue  # no following row to compare against
        last_close = histstock_data[histstock_index]
        current_close = histstock_data[histstock_index + 1]
        if last_close < current_close:
            Y.append([1., 0.])
        else:
            Y.append([0., 1.])
        X.append(np.copy(headlines_data[i: i + window_size]))
    return np.array(X), np.array(Y)
def create_timeframed_doc2vec_classification_data(stock, window_size, min_time_disparity=3, norm=True):
    """
    Timeframe Doc2Vec Classification
    Creates data for prediction of stock up/down from the last `window_size` headline vectors

    A headline window is used only if no two consecutive days inside it
    are more than `min_time_disparity` days apart (despite its name the
    parameter acts as the maximum allowed gap). The day of the headline
    entry right after the window is then matched to a stock row, rolling
    forward up to 4 calendar days when that exact day has no stock row.
    The label compares the value on the matched row with the next row
    ([1., 0.] = up, [0., 1.] = not up).

    stock: name passed to `csv_as_numpy` and `headline_doc2vec_csv_as_numpy`
    window_size: number of consecutive headline vectors per input
    min_time_disparity: largest accepted gap, in days, between neighbouring window days
    norm: when True, headline vectors are divided by a hardcoded stddev of 0.015

    Returns (X, Y): inputs [samples, window_size, doc2vec] and one-hot labels.
    """
    def parse_date(date):
        # Days are 'YYYY-MM-DD' strings
        return datetime(int(date[:4]), int(date[5:7]), int(date[8:]))

    # The stock csv is loaded once; the former second load fed an unused variable.
    days1, histstock_data = csv_as_numpy(stock)
    days2, headlines_data = headline_doc2vec_csv_as_numpy(stock)
    histstock_data = histstock_data[:, 4] # ADJ_CLOSE (column 3 is close in this file's column legend)

    # One-off day -> row lookup; setdefault keeps the first occurrence like list.index()
    index_by_day = {}
    for position, day in enumerate(days1):
        index_by_day.setdefault(day, position)

    headline_stddev = 0.015 # Hardcoded stddev
    max_diff = timedelta(days=min_time_disparity)
    one_day = timedelta(days=1)
    X, Y = [], []
    for i in range(1, len(headlines_data) - window_size - 1):
        ## Check timeframe disparity ##
        days_timeframe = list(days2[i: i + window_size])
        if any(parse_date(b) - parse_date(a) > max_diff
               for a, b in zip(days_timeframe, days_timeframe[1:])):
            continue
        ## Find close price ##
        try:
            end_date = parse_date(days2[i + window_size])
        except ValueError:
            continue  # unparsable day: skip the window, as before
        histstock_index = None
        for _ in range(5):  # the day itself plus up to 4 days forward
            histstock_index = index_by_day.get(end_date.strftime('%Y-%m-%d'))
            if histstock_index is not None:
                break
            end_date = end_date + one_day
        if histstock_index is None:
            continue
        if histstock_index + 1 >= len(histstock_data):
            continue
        last_close = histstock_data[histstock_index]
        current_close = histstock_data[histstock_index + 1]
        ######################
        if last_close < current_close:
            Y.append([1., 0.])
        else:
            Y.append([0., 1.])
        headline_timeframe = np.copy(headlines_data[i: i + window_size])
        if norm:
            headline_timeframe = headline_timeframe / headline_stddev
        X.append(headline_timeframe)
    return np.array(X), np.array(Y)
def create_timeframed_doc2vec_ticker_classification_data(stock, window_size_ticker, window_size_headlines, min_time_disparity=3, norm=True):
    """
    Timeframe Doc2Vec + Ticker Classification
    Creates data for prediction of stock up/down from the last `window_size_headlines`
    headline vectors together with the last `window_size_ticker` stock rows

    A headline window is used only if no two consecutive days inside it
    are more than `min_time_disparity` days apart (despite its name the
    parameter acts as the maximum allowed gap). The day of the headline
    entry right after the window is matched to a stock row, rolling
    forward up to 4 calendar days when that exact day has no stock row.
    The ticker input holds the `window_size_ticker` stock rows before the
    matched row. Samples with fewer than `window_size_ticker` preceding
    rows are skipped; previously the negative slice start wrapped around
    and produced empty or wrong-sized ticker windows.

    stock: name passed to `csv_as_numpy` and `headline_doc2vec_csv_as_numpy`
    window_size_ticker: number of stock rows per ticker input
    window_size_headlines: number of headline vectors per headline input
    min_time_disparity: largest accepted gap, in days, between neighbouring window days
    norm: when True, headline vectors are divided by a hardcoded stddev of
          0.015 and ticker rows are standardised per column (zero-stddev
          columns are only centred)

    Returns (X, X2, Y): headline inputs, ticker inputs and one-hot labels
    ([1., 0.] = up, [0., 1.] = not up), all aligned by sample.
    """
    def parse_date(date):
        # Days are 'YYYY-MM-DD' strings
        return datetime(int(date[:4]), int(date[5:7]), int(date[8:]))

    days1, histstock_data = csv_as_numpy(stock)
    days2, headlines_data = headline_doc2vec_csv_as_numpy(stock)
    ticker_data = histstock_data[:, (0,1,2,4,5)] # OPEN HIGH LOW close ADJ_CLOSE VOLUME

    # One-off day -> row lookup; setdefault keeps the first occurrence like list.index()
    index_by_day = {}
    for position, day in enumerate(days1):
        index_by_day.setdefault(day, position)

    headline_stddev = 0.015 # Hardcoded stddev
    max_diff = timedelta(days=min_time_disparity)
    one_day = timedelta(days=1)
    X, X2, Y = [], [], []
    for i in range(1, len(headlines_data) - window_size_headlines - 1):
        ## Check timeframe disparity ##
        days_timeframe = list(days2[i: i + window_size_headlines])
        if any(parse_date(b) - parse_date(a) > max_diff
               for a, b in zip(days_timeframe, days_timeframe[1:])):
            continue
        ## Find close price ##
        try:
            end_date = parse_date(days2[i + window_size_headlines])
        except ValueError:
            continue  # unparsable day: skip the window, as before
        histstock_index = None
        for _ in range(5):  # the day itself plus up to 4 days forward
            histstock_index = index_by_day.get(end_date.strftime('%Y-%m-%d'))
            if histstock_index is not None:
                break
            end_date = end_date + one_day
        if histstock_index is None:
            continue
        if histstock_index + 1 >= len(histstock_data):
            continue
        if histstock_index < window_size_ticker:
            continue  # not enough preceding stock rows for a full ticker window
        last_close = histstock_data[histstock_index, 4]
        current_close = histstock_data[histstock_index + 1, 4]
        ######################
        headline_timeframe = np.copy(headlines_data[i: i + window_size_headlines])
        stock_timeframe = np.copy(ticker_data[histstock_index - window_size_ticker:histstock_index])
        if norm:
            headline_timeframe = headline_timeframe / headline_stddev
            # NOTE(review): statistics leave out the last ticker row, as in the
            # original code, although that row is part of the input - confirm intent.
            reference = stock_timeframe[:-1]
            mean = np.mean(reference, axis=0)
            std = np.std(reference, axis=0)
            std[std == 0] = 1.0  # constant column: avoid inf/nan from division by zero
            stock_timeframe = (stock_timeframe - mean) / std
        if last_close < current_close:
            Y.append([1., 0.])
        else:
            Y.append([0., 1.])
        X.append(headline_timeframe)
        X2.append(stock_timeframe)
    return np.array(X), np.array(X2), np.array(Y)
In [4]:
# Split Data
def split_data(X, Y, ratio=.8, mix=True):
    """
    Cuts X/Y into a leading Train part and a trailing Test part

    The first int(len(X) * ratio) samples form the train set. When `mix`
    is set, only the train set is shuffled (X and Y together, with
    random_state=0); the test set keeps its order.
    """
    cut = int(len(X) * ratio)
    trainX = X[:cut]
    trainY = Y[:cut]
    testX = X[cut:]
    testY = Y[cut:]
    if not mix:
        return trainX, trainY, testX, testY
    trainX, trainY = shuffle(trainX, trainY, random_state=0)
    return trainX, trainY, testX, testY
def split_data2(X, X2, Y, ratio=.8, mix=True):
    """
    Splits X/X2/Y to Train/Test

    The first int(len(X) * ratio) samples form the train set. When `mix`
    is set, the train parts of X, X2 and Y are reordered with one shared
    permutation drawn from a private RNG seeded with 0, so the result is
    reproducible (in line with `split_data`, which shuffles with
    random_state=0) and the global numpy RNG state is left untouched.
    The test set keeps its order.

    X, X2, Y: np arrays with the same number of samples
    ratio: fraction of samples used for training
    mix: shuffle the train set

    Returns trainX, trainX2, trainY, testX, testX2, testY.
    Raises ValueError when the inputs differ in number of samples.
    """
    if not (len(X) == len(X2) == len(Y)):
        raise ValueError("X, X2 and Y must have the same number of samples, got %d, %d and %d"
                         % (len(X), len(X2), len(Y)))
    train_size = int(len(X) * ratio)
    trainX, testX = X[:train_size], X[train_size:]
    trainX2, testX2 = X2[:train_size], X2[train_size:]
    trainY, testY = Y[:train_size], Y[train_size:]
    if mix:
        # Same index order for all three arrays keeps the samples aligned
        indexes = np.random.RandomState(0).permutation(len(trainX))
        trainX = trainX[indexes]
        trainX2 = trainX2[indexes]
        trainY = trainY[indexes]
    return trainX, trainX2, trainY, testX, testX2, testY
In [5]:
# Run (Test)
if __name__ == "__main__":
    # Quick visual check: plot columns 3, 1 and 2 of the AAPL data
    # (collected as closing / high / low) for the last 50 rows.
    rows = csv_as_numpy('AAPL')[1]
    closing = [row[3] for row in rows]
    high = [row[1] for row in rows]
    low = [row[2] for row in rows]
    for series in (closing, high, low):
        plt.plot(series[-50:])
    plt.show()