In [1]:
import os
import requests
import csv
import re
import numpy as np
import pandas as pd
from scipy import stats
import datetime
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
def get_stock_data(ticker, seconds_interval, num_of_days):
    """Download intraday bars for one ticker and derive per-bar features.

    Parameters
    ----------
    ticker : value substituted into the ``q`` query parameter.
    seconds_interval : bar length in seconds (the ``i`` query parameter); also
        the multiplier used to expand abbreviated timestamps.
    num_of_days : history length in days (the ``p`` query parameter).

    Returns
    -------
    (stock_df, labels_df)
        ``labels_df`` keeps every column (``time``, the raw price/volume
        columns and the three derived features); ``stock_df`` is the same
        frame without CLOSE, OPEN, LOW, HIGH and VOLUME.

    NOTE(review): ``datetime.fromtimestamp`` converts using the machine's
    local timezone, so the ``time`` values depend on where this runs.
    """
    url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)
    # fetch the CSV, skipping the preamble rows around the header line
    stock_df = pd.read_csv(url, skiprows=[0,1,2,3,5,6])
    # name the timestamp column 'time' and strip the 'a' marker from it
    stock_df = (
        stock_df
        .rename(columns={'COLUMNS=DATE':'time'})
        .replace(to_replace={'time':{'a':''}}, regex=True)
    )
    raw_values = [int(value) for value in stock_df['time'].values]
    # expand abbreviated entries: a value smaller than the current anchor is
    # an offset counted in intervals from it; anything else is a new anchor
    anchor = raw_values[0]
    full_stamps = []
    for value in raw_values:
        if value < anchor:
            full_stamps.append(anchor + (value * seconds_interval))
        else:
            anchor = value
            full_stamps.append(value)
    # keep only the time of day, encoded as the float HH.MM
    times = []
    for stamp in full_stamps:
        readable = datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d %H:%M:%S')
        times.append(float(readable[-8:-3].replace(':','.')))
    stock_df['time'] = times
    # derived features
    close_minus_open = stock_df['CLOSE'] - stock_df['OPEN']
    high_minus_low = stock_df['HIGH'] - stock_df['LOW']
    stock_df['op_cl%'] = np.true_divide(close_minus_open, stock_df['CLOSE'])
    stock_df['lo_hi%'] = np.true_divide(high_minus_low, stock_df['HIGH'])
    stock_df['vol_norm'] = np.true_divide(stock_df['VOLUME'], np.max(stock_df['VOLUME']))
    # the labels frame is a full snapshot taken before the raw columns go
    labels_df = stock_df.copy(deep=True)
    raw_columns = ['CLOSE', 'OPEN', 'LOW', 'HIGH', 'VOLUME']
    return stock_df.drop(raw_columns, axis=1), labels_df
In [3]:
# Ticker symbols are the file names in quandl_data/ without their extension.
# splitext handles extensions of any length (the old [:-4] slice assumed
# exactly four characters), hidden files such as .DS_Store are skipped, and
# sorting makes the order independent of the arbitrary os.listdir order.
tickers = sorted(
    os.path.splitext(filename)[0]
    for filename in os.listdir('quandl_data')
    if not filename.startswith('.')
)
In [4]:
# download data
# Fetch every ticker and cache its feature/label frames as CSV files under
# goog_data/. Tickers that fail are collected in remove_tickers.
seconds_interval = 1800 # 1800: 30-minute (seems the most consistent)
# create the output directory up front; without it every to_csv call fails
# and every ticker would be reported as a bad download
if not os.path.isdir("goog_data"):
    os.makedirs("goog_data")
remove_tickers = []
for ticker in tickers:
    try:
        stock_df, labels_df = get_stock_data(ticker, seconds_interval, 1000)
        stock_df.to_csv("goog_data/{}_features.csv".format(ticker), index=False)
        labels_df.to_csv("goog_data/{}_labels.csv".format(ticker), index=False)
    except Exception as error:
        # still best-effort, but report why, and let KeyboardInterrupt /
        # SystemExit through (a bare except would swallow them)
        print("problem with {}: {!r}".format(ticker, error))
        remove_tickers.append(ticker)
In [5]:
# Drop the tickers whose download failed. Filtering (rather than calling
# list.remove for its side effect) makes the cell safe to re-run: a second
# run no longer raises ValueError for tickers that are already gone.
failed_tickers = set(remove_tickers)
tickers = [ticker for ticker in tickers if ticker not in failed_tickers]
print("{} {}".format(len(tickers), tickers))
In [6]:
# import data
# Build the training set from the cached CSV files:
#   X      - one row per (ticker, day): that day's intraday features flattened
#   y      - binary label of the FOLLOWING day for the same ticker
#   pred_X - each ticker's most recent day, which has no label yet
# The one-day shift between features and labels is applied per ticker.
# The previous version shifted the arrays accumulated over all tickers, so
# the pairing drifted by one extra row with every ticker processed.
feature_blocks = []
label_blocks = []
pred_rows = []
for ticker in tickers:
    stock_df = pd.read_csv("goog_data/{}_features.csv".format(ticker))
    labels_df = pd.read_csv("goog_data/{}_labels.csv".format(ticker))
    if ticker == tickers[0]:
        print(stock_df.head())
        print(stock_df.tail())
    num_of_times = stock_df['time'].unique().shape[0]
    stock_df = stock_df.drop('time', axis=1)
    assert num_of_times == 14, "wrong number of times, got {0} for {1}".format(num_of_times, ticker)
    # floor division: a trailing partial day is ignored
    num_of_days = stock_df.shape[0] // num_of_times
    used_rows = num_of_days * num_of_times
    # one row per day, each row being that day's bars flattened in order
    day_features = stock_df.values[:used_rows].reshape(num_of_days, -1)
    day_values = labels_df.values[:used_rows].reshape(num_of_days, -1)
    # (last - open) / last
    # NOTE(review): index -8 is column 1 of the day's last bar and index 1 is
    # column 1 of its first bar (assuming 9 columns per bar); confirm that
    # column 1 is the intended "open"/"last" price.
    last_value = day_values[:, -8]
    first_value = day_values[:, 1]
    day_change = np.true_divide(last_value - first_value, last_value)
    # binary class: 1 when the change is at least 5 percent
    day_labels = (day_change * 100 >= 5.0).astype(int)
    # day d's features are paired with day d+1's label; the last day of each
    # ticker has no label and is kept for prediction
    feature_blocks.append(day_features[:-1])
    label_blocks.append(day_labels[1:])
    pred_rows.append(day_features[-1:])
if not feature_blocks:
    raise ValueError("no tickers to import")
# stack once at the end instead of growing the arrays inside the loop
X = np.vstack(feature_blocks)
y = np.concatenate(label_blocks)
pred_X = np.vstack(pred_rows)
print("\n{}\nfinal shapes: {} {} {}".format("*"*10, X.shape, y.shape, pred_X.shape))
In [7]:
# Show the first training row rounded to 5 decimals as a quick sanity check.
np.round(X[0], 5)
Out[7]:
In [8]:
# Class balance of the binary labels; title and axis labels let the figure
# stand on its own.
plt.hist(y, bins=20, alpha=0.7, color='r')
plt.title("Label distribution")
plt.xlabel("label (1 = change of at least 5%)")
plt.ylabel("number of samples")
plt.show()
In [9]:
# Show the first row held back for prediction.
pred_X[0]
Out[9]:
In [10]:
def add_bias(matrix):
    """Return ``matrix`` with a leading column of ones.

    A 1-D input is first turned into a single column, so the result always
    has one more column than the (possibly reshaped) input.
    """
    row_count = matrix.shape[0]
    columns = matrix.reshape(row_count, 1) if matrix.ndim == 1 else matrix
    bias_column = np.ones((row_count, 1))
    return np.hstack((bias_column, columns))
In [11]:
def g(z):
    """Logistic sigmoid 1 / (1 + exp(-z)), applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator
In [12]:
def g_prime(z):
    """Derivative of the logistic sigmoid, evaluated at pre-activation ``z``.

    Computes ``g(z) * (1 - g(z))`` element-wise. The previous body returned
    ``z * (1 - z)``, which is only the derivative when it is handed the
    sigmoid's output; the callers in this file pass the pre-activation.
    The sigmoid is evaluated inline so this function stands on its own.
    """
    activation = np.true_divide(1, 1 + np.exp(-z))
    return np.multiply(activation, 1 - activation)
In [13]:
def J(m, L, s, thetas, predictions, y, lambda_reg):
    """Regularised cross-entropy cost over the first ``m`` samples.

    Parameters
    ----------
    m : number of samples to include.
    L : number of layers; ``L - 1`` weight matrices are regularised.
    s : units per layer; ``s[l]`` inputs and ``s[l+1]`` outputs bound the part
        of ``thetas[l]`` that is regularised.
    thetas : weight matrices whose first column is the bias column.
    predictions : per-sample outputs (scalars or arrays of a common shape).
    y : per-sample targets.
    lambda_reg : regularisation strength.

    Returns
    -------
    The cost, with the same shape as a single entry of ``predictions``
    (a scalar for scalar predictions).

    Predictions are clipped away from exactly 0 and 1 so a saturated output
    gives a large finite cost rather than nan/inf from ``log(0)``.
    """
    tiny = np.finfo(float).eps
    clipped = np.clip(np.asarray(predictions[:m], dtype=float), tiny, 1.0 - tiny)
    # reshape targets so they broadcast against per-sample prediction arrays
    targets = np.asarray(y[:m], dtype=float).reshape((m,) + (1,) * (clipped.ndim - 1))
    log_likelihood = np.sum(
        targets * np.log(clipped) + (1.0 - targets) * np.log(1.0 - clipped),
        axis=0,
    )
    left_side = np.true_divide(log_likelihood, -m)
    theta_sums = 0.0
    for l in range(L - 1):
        # (skip bias term)
        weights = np.asarray(thetas[l])[:s[l + 1], 1:s[l] + 1]
        theta_sums += np.sum(weights ** 2)
    right_side = np.true_divide(lambda_reg, 2.0 * m) * theta_sums
    return left_side + right_side
lambda_reg = 100
#J(m, L, s, thetas, predictions, y, lambda_reg)
In [14]:
def create_thetas(L, s, init_seed_value=42):
    """Create reproducible random weight matrices for an ``L``-layer network.

    Parameters
    ----------
    L : number of layers; ``L - 1`` matrices are returned.
    s : units per layer; matrix ``l`` has shape ``(s[l+1], s[l] + 1)``, the
        extra column being the bias weight.
    init_seed_value : base seed; the global NumPy generator is seeded with
        ``init_seed_value + 1``.

    The generator is seeded once, before the loop. Seeding inside the loop
    with the same value restarted the random stream for every layer, so
    layers of equal shape received identical weights. The first matrix is
    unchanged from the previous behaviour.
    """
    np.random.seed(init_seed_value + 1)
    thetas = []
    for l in range(L - 1):
        s_j = s[l]
        s_jplus1 = s[l + 1]
        thetas.append(np.random.standard_normal((s_jplus1, s_j + 1)))
    return thetas
In [15]:
# Prepend the bias column of ones to the training matrix.
# NOTE(review): X is rebound to its own transformed value, so running this
# cell twice adds a second bias column.
X = add_bias(X)
In [17]:
# number of input features (columns of X minus the bias column)
n = X.shape[1] - 1
# number of layers
L = 4
# number of units in each layer: every layer is n wide except the single output unit
s = [n] * (L - 1) + [1]
thetas = create_thetas(L, s)
# display the shape of each weight matrix
list(map(np.shape, thetas))
Out[17]:
In [31]:
def forward_propagation(a1, thetas):
    """Run the input ``a1`` through the three weight matrices in ``thetas``.

    Returns ``(a2, z2, a3, z3, a4)``:
      * ``a2`` - activation of the first weight layer, bias column prepended
      * ``z2`` - ``a2`` times ``thetas[1]`` transposed (pre-activation of ``a3``)
      * ``a3`` - ``g(z2)`` with the bias column prepended
      * ``z3`` - ``a3`` times ``thetas[2]`` transposed (pre-activation of ``a4``)
      * ``a4`` - ``g(z3)``, the network output
    """
    first_hidden = add_bias(g(np.dot(a1, thetas[0].T)))
    second_pre = np.dot(first_hidden, thetas[1].T)
    second_hidden = add_bias(g(second_pre))
    output_pre = np.dot(second_hidden, thetas[2].T)
    return first_hidden, second_pre, second_hidden, output_pre, g(output_pre)
In [41]:
def back_propagation(m, X, thetas, y, lambda_reg):
    """Average cost gradients for ``thetas[1]`` and ``thetas[2]`` over ``m`` samples.

    Parameters
    ----------
    m : number of leading rows of ``X`` / entries of ``y`` to use.
    X : design matrix including the bias column.
    thetas : the three weight matrices consumed by ``forward_propagation``.
    y : targets, one per row of ``X``.
    lambda_reg : accepted for interface compatibility. The gradients are NOT
        regularised, as before (the regularisation line was disabled).

    Returns
    -------
    (D2, D3, predictions)
        ``D2`` and ``D3`` are the transposed gradients for ``thetas[1]`` and
        ``thetas[2]``: ``D2.T`` has the shape of ``thetas[1]`` and ``D3.T``
        the shape of ``thetas[2]``. ``predictions`` is a list with one
        single-row output array per sample.

    Changes from the previous version:
      * the hidden-layer delta uses the derivative of that hidden layer's own
        activation and drops the bias row (it used the output pre-activation);
      * ``Delta2`` is a true outer-product accumulation, so it has the full
        shape of ``thetas[1]`` transposed instead of a single column;
      * the batch is processed in one pass and no longer reads the global ``n``.
    """
    inputs = np.asarray(X)[:m]
    targets = np.asarray(y[:m], dtype=float).reshape(-1, 1)
    a2, z2, a3, z3, a4 = forward_propagation(inputs, thetas)
    # keep the per-sample, single-row layout that callers index into
    predictions = [a4[i:i + 1] for i in range(a4.shape[0])]
    # output error
    d4 = a4 - targets
    # hidden error: push d4 back through the non-bias weights and scale by the
    # sigmoid derivative, written in terms of the activation itself
    hidden = a3[:, 1:]
    d3 = np.multiply(d4.dot(thetas[2][:, 1:]), np.multiply(hidden, 1 - hidden))
    # summing the per-sample outer products is a single matrix product
    Delta3 = a3.T.dot(d4)
    Delta2 = a2.T.dot(d3)
    D2 = np.true_divide(Delta2, m)
    D3 = np.true_divide(Delta3, m)
    # (skip bias term for regularization)
    #D2[:,1:] = D2[:,1:] + (lambda_reg * thetas[1][:,1:])
    return D2, D3, predictions
#back_propagation(m, X, thetas, predictions, y, lambda_reg)
In [46]:
# Train the network with batch gradient descent and plot the cost per iteration.
m = X.shape[0]
thetas = create_thetas(L, s)
#print [theta.shape for theta in thetas]
costs = []
# The update below SUBTRACTS alpha * gradient, so alpha must be positive to
# move downhill; the previous value of -0.3 stepped up the gradient.
alpha = 0.3
iterations = 50
# gradient descent
# NOTE: only thetas[1] and thetas[2] are updated; back_propagation returns no
# gradient for thetas[0].
for i in range(iterations):
    if i % 10 == 0:
        print(i)
    D2, D3, predictions = back_propagation(m, X, thetas, y, lambda_reg)
    cost = J(m, L, s, thetas, predictions, y, lambda_reg)
    # J may return a scalar or a small array; store a plain float either way
    costs.append(float(np.ravel(cost)[0]))
    thetas[1] = np.subtract(thetas[1], np.multiply(alpha, D2).T)
    thetas[2] = np.subtract(thetas[2], np.multiply(alpha, D3).T)
plt.plot(list(range(iterations)), costs)
plt.xlabel("iteration")
plt.ylabel("cost")
plt.show()
In [20]:
# Print the shape of every weight matrix, then display the matrices themselves.
print(list(map(np.shape, thetas)))
thetas
Out[20]:
In [21]:
# Prepend the bias column of ones to the prediction rows, then show the first one.
# NOTE(review): pred_X is rebound to its own transformed value, so running this
# cell twice adds a second bias column.
pred_X = add_bias(pred_X)
pred_X[0]
Out[21]:
In [22]:
# Score each ticker's held-out row and print: index, ticker, network output.
# Fixes: rows are read from pred_X (the loop used to read the training matrix
# X), all five values returned by forward_propagation are unpacked (unpacking
# into two names raised ValueError), and the printed value is the network
# output a4 rather than a hidden activation.
for i in range(pred_X.shape[0]):
    a1 = pred_X[i:i + 1, :]
    a2, z2, a3, z3, a4 = forward_propagation(a1, thetas)
    print(" ".join([str(i).rjust(2), str(tickers[i]).rjust(4), str(np.round(a4[0][0], 2))]))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# Print every (index, ticker, prediction) triple and collect the tickers whose
# prediction equals 1.
# NOTE(review): pred_tickers and y_predictions are not defined anywhere in the
# visible notebook; this cell presumably relies on an earlier session.
positive_tickers = []
for i in range(len(pred_tickers)):
    candidate = pred_tickers[i]
    predicted = y_predictions[i]
    print(" ".join(str(item) for item in (i, candidate, predicted)))
    if predicted == 1:
        positive_tickers.append(candidate)
In [ ]:
# For every positively-predicted ticker, plot its recent "OC%" history against
# the 5% threshold and print how often the threshold was exceeded.
# NOTE(review): prediction_df is not defined anywhere in the visible notebook.
past_days = 100
oc_threshold = 0.05
for ticker in positive_tickers:
    oc = prediction_df[prediction_df['ticker'] == ticker]["OC%"][-past_days:]
    # fewer than past_days rows may exist, so use the real count everywhere
    num_days = oc.shape[0]
    day_range = np.arange(num_days)
    plt.plot(day_range, oc, alpha=0.5)
    plt.plot(day_range, [oc_threshold for x in day_range], color='r')
    plt.title("{0} (previous {1} days)".format(ticker, num_days))
    plt.show()
    # divide by the days actually present; dividing by past_days understated
    # the frequency for tickers with a shorter history
    if num_days:
        frequency = np.true_divide(np.sum(oc.values > oc_threshold), num_days)
    else:
        frequency = float('nan')
    print("\t{0} {1}-day freq probability: {2}".format(ticker, num_days, frequency))
    print("~"*50 + " \n")
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# Scratch cell: loop scaffold whose body has not been written yet.
# NOTE(review): this rebinds the global `iterations` to 1.
#errors = []
iterations = 1
for i in range(iterations):
    # a loop body made only of comments is a syntax error; keep the cell runnable
    pass
#plt.plot([x for x in xrange(iterations)], errors)
#plt.show()
In [36]:
# Scratch check of the shapes produced by `inputs . theta^T`.
# The toy matrix is named X_demo so that running this cell no longer
# overwrites the training matrix X used by the cells above.
X_demo = np.array([[1,2,3,4],
                   [1,3,4,5],
                   [1,6,7,8],
                   [1,5,4,3],
                   [1,2,3,3]])
print(X_demo.shape)
theta1 = np.array([[2,2,3,4],
                   [4,3,4,5],
                   [6,6,7,8]])
print(theta1.T)
print(theta1.shape)
print(X_demo.dot(theta1.T))
print(X_demo.dot(theta1.T).shape)