In [153]:
import requests
import csv
import numpy as np
import pandas as pd
import datetime
import Quandl
from matplotlib import pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import tree
%matplotlib inline
In [154]:
def get_stock_data(ticker, seconds_interval, num_of_days):
    # build the Google Finance getprices URL for intraday bars
    goog_fin_url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)
    r = requests.get(goog_fin_url)
    #print r.status_code
    #print r.headers
    #print r.content[:200]
    # read the CSV straight from the URL (the requests call above is only for inspection),
    # skipping the non-CSV metadata lines around the COLUMNS= header row
    stock_df = pd.read_csv(goog_fin_url, skiprows=[0, 1, 2, 3, 5, 6])
    # rename the first column ('COLUMNS=DATE') to something usable
    stock_df.rename(columns={'COLUMNS=DATE': 'TIMEINDEX'}, inplace=True)
    # strip the 'a' prefix from full unix timestamps
    stock_df.replace(to_replace={'TIMEINDEX': {'a': ''}}, regex=True, inplace=True)
    # get the entire column and convert it to ints
    time_indices = [int(x) for x in stock_df['TIMEINDEX'].values]
    # keep track of the most recent full timestamp
    last_timestamp = time_indices[0]
    # expand interval offsets into full unix timestamps
    for i in range(len(time_indices)):
        if time_indices[i] < last_timestamp:
            time_indices[i] = last_timestamp + (time_indices[i] * int(seconds_interval))
        else:
            last_timestamp = time_indices[i]
    # convert unix timestamps to a human-readable format
    time_indices = [datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S') for x in time_indices]
    # print the first and last timestamps
    print "first: {}".format(time_indices[0]), "last: {}".format(time_indices[-1])
    # keep only the intraday time of day as HH.MM (drop the date)
    times = [float(x[-8:-3].replace(':', '.')) for x in time_indices]
    # overwrite the column in the data frame
    stock_df['TIMEINDEX'] = times
    # keep the date portion
    dates = [x[:10] for x in time_indices]
    # create a new column in the data frame
    stock_df['DATE'] = dates
    # remove values
    #print "remove lesser values after final (max): ", stock_df['TIMEINDEX'].unique()[-6:-1]
    #for x in stock_df['TIMEINDEX'].unique()[-5:-1]:
    #    stock_df = stock_df[stock_df['TIMEINDEX'] != x]
    print "shape:", stock_df.shape
    return stock_df
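For reference, the parser above assumes the raw getprices response looks roughly like the sketch below (the exact header lines are an assumption from memory, and the service has since been retired): seven metadata lines, of which only the COLUMNS=... line is kept as the CSV header, followed by rows whose first field is either an 'a'-prefixed full Unix timestamp or an offset in units of the requested interval. Values are made up for illustration.
# Assumed shape of the getprices payload (illustrative values only):
#   EXCHANGE%3DNASDAQ
#   MARKET_OPEN_MINUTE=570
#   MARKET_CLOSE_MINUTE=960
#   INTERVAL=1800
#   COLUMNS=DATE,CLOSE,HIGH,LOW,OPEN,VOLUME
#   DATA=
#   TIMEZONE_OFFSET=-240
#   a1490990400,4.08,4.10,4.00,4.05,12345
#   1,4.07,4.09,4.02,4.06,6789
# Toy check of the timestamp expansion performed by the loop above:
base = 1490990400            # an 'a'-prefixed row gives a full Unix timestamp
interval = 1800              # the requested seconds_interval
offsets = [1, 2, 3]          # later rows carry interval offsets, not timestamps
print [base + o * interval for o in offsets]   # [1490992200, 1490994000, 1490995800]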
In [155]:
ticker = "TNXP" # TNXP, LPTH, NVDA, BLFS
seconds_interval = "1800" # 300: 5-minute
num_of_days = "50"
stock_X_df = get_stock_data(ticker, seconds_interval, num_of_days)
stock_X_df.head()
Out[155]:
In [156]:
def create_data(ticker):
    seconds_interval = "1800"  # 1800 s = 30-minute bars (300 would be 5-minute)
    num_of_days = "50"
    stock_X_df = get_stock_data(ticker, seconds_interval, num_of_days)
    # find the smallest flattened row width across days, so every day can be truncated to it
    num_of_cols = set()
    for date in stock_X_df['DATE'].unique():
        num_of_cols.add(stock_X_df[stock_X_df['DATE'] == date].drop('DATE', axis=1).values.flatten().shape[0])
    min_num_of_cols = min(num_of_cols)
    print min_num_of_cols
    # build one row per trading day: that day's intraday bars, flattened and truncated
    X = np.array([])
    for date in stock_X_df['DATE'].unique():
        if X.shape[0] == 0:
            X = stock_X_df[stock_X_df['DATE'] == date].drop('DATE', axis=1).values.flatten()[:min_num_of_cols]
        else:
            X = np.vstack([X, stock_X_df[stock_X_df['DATE'] == date].drop('DATE', axis=1).values.flatten()[:min_num_of_cols]])
    # drop the last day's features (it has no next-day label)
    X = np.delete(X, -1, axis=0)
    #print X.shape
    #print pd.DataFrame(X).head()
    # labels come from daily bars
    seconds_interval = "86400"  # 86400 s = daily bars
    stock_y_df = get_stock_data(ticker, seconds_interval, num_of_days)
    y = (stock_y_df['HIGH'].values)
    #y = (stock_y_df['HIGH'].values - stock_y_df['OPEN'].values)
    # shift by one so day i's features pair with day i+1's high
    y = y[1:]
    #print y.shape
    #print y[:5]
    return X, y
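A small check of how the deletions above line features and labels up (an added illustration, not original notebook output): dropping the last feature row and the first daily high pairs each day's intraday bars with the following day's high.
# Alignment sketch: with trading days d0..d3, row i of X is day i's flattened
# intraday bars and y[i] is day i+1's daily HIGH.
days = ['d0', 'd1', 'd2', 'd3']
feature_days = days[:-1]     # X keeps d0, d1, d2 (last day dropped)
label_days = days[1:]        # y keeps d1, d2, d3 (first high dropped)
print zip(feature_days, label_days)   # [('d0', 'd1'), ('d1', 'd2'), ('d2', 'd3')]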
In [157]:
tickers = ["TNXP", "NVDA"] # TNXP, LPTH, NVDA, BLFS
X = np.array([])
y = np.array([])
for ticker in tickers:
    if X.shape[0] == 0:
        X, y = create_data(ticker)
    else:
        newX, newy = create_data(ticker)
        X = np.vstack([X, newX])
        y = np.hstack([y, newy])
In [158]:
features_train, features_test, labels_train, labels_test = train_test_split(X, y, test_size=0.2, random_state=42)
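train_test_split shuffles rows before splitting; since each row here is a trading day, a chronological hold-out is a possible alternative. A minimal sketch of that variant (an aside, not what this notebook uses):
# Hold out the most recent 20% of rows instead of a random 20%.
# Note: X stacks one ticker's days after another's, so this split is only
# chronological within each ticker's block.
split_at = int(X.shape[0] * 0.8)
features_train_alt, features_test_alt = X[:split_at], X[split_at:]
labels_train_alt, labels_test_alt = y[:split_at], y[split_at:]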
In [159]:
clf = BayesianRidge()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
clf.score(features_test, labels_test)
Out[159]:
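For regressors, score returns the coefficient of determination R² on the held-out data; the same quantity can be computed by hand from the predictions above as a cross-check:
# R^2 = 1 - SS_res / SS_tot, computed from the test predictions
ss_res = ((labels_test - pred) ** 2).sum()
ss_tot = ((labels_test - labels_test.mean()) ** 2).sum()
print 1.0 - ss_res / ss_tot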
In [160]:
#clf = GaussianNB()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred
In [161]:
#clf = SVC(kernel="linear")
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred
In [162]:
#clf = SVR(kernel='rbf', C=10000.0)
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)
In [163]:
#clf = tree.DecisionTreeClassifier()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)
In [164]:
clf = tree.DecisionTreeRegressor()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
clf.score(features_test, labels_test)
Out[164]:
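matplotlib is imported above but never used; a minimal plotting sketch for eyeballing the tree's held-out predictions (an addition, not part of the original output):
# Overlay predicted vs. actual next-day highs on the test rows
plt.figure(figsize=(8, 4))
plt.plot(labels_test, marker='o', label='actual high')
plt.plot(pred, marker='x', label='predicted high')
plt.xlabel('test sample')
plt.ylabel('price')
plt.legend(loc='best')
plt.show()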
In [165]:
pd.DataFrame(features_test).head()
Out[165]:
In [166]:
pd.DataFrame(labels_test).head()
Out[166]:
In [167]:
pd.DataFrame(pred).head()
Out[167]:
In [168]:
labels_test - pred
Out[168]:
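A one-number summary of the residual vector above (a small addition): the mean absolute error in price units.
print np.mean(np.abs(labels_test - pred))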