In [724]:
import requests
import csv
import numpy as np
import pandas as pd
import datetime
import Quandl
from matplotlib import pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import tree
%matplotlib inline
In [725]:
def get_stock_data(ticker, seconds_interval, num_of_days):
    """Download price bars from the Google Finance ``getprices`` endpoint.

    Parameters
    ----------
    ticker : str
        Symbol sent as the ``q`` query parameter.
    seconds_interval : str or int
        Bar length in seconds (``i`` query parameter). It is also the
        multiplier used to expand the abbreviated timestamps in the response.
    num_of_days : str or int
        Look-back window in days (sent as ``p=<n>d``).

    Returns
    -------
    pandas.DataFrame
        The parsed response where the ``COLUMNS=DATE`` column is replaced by
        ``TIMEINDEX`` (time of day encoded as the float ``HH.MM``, so 09:30
        becomes 9.3) plus a new ``DATE`` column (``YYYY-MM-DD`` string).
        Both are produced with ``datetime.fromtimestamp`` and are therefore
        expressed in the local time zone of the machine running the code.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status code.
    ValueError
        If the response holds no data rows.
    """
    import io  # local import: only needed to hand the response text to pandas

    goog_fin_url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)

    # Fetch the payload exactly once and fail loudly on HTTP errors. The
    # timeout keeps a stalled connection from hanging the kernel forever.
    r = requests.get(goog_fin_url, timeout=30)
    r.raise_for_status()

    # Rows 0-3 and 5-6 are response metadata; row 4 is the column header.
    stock_df = pd.read_csv(io.StringIO(r.text), skiprows=[0, 1, 2, 3, 5, 6])
    if stock_df.empty:
        raise ValueError("no price rows returned for ticker {0!r}".format(ticker))
    stock_df = stock_df.rename(columns={'COLUMNS=DATE': 'TIMEINDEX'})

    # Full unix timestamps carry an 'a' prefix; the remaining rows hold a
    # small integer counting intervals since the last full timestamp.
    raw_indices = [int(str(x).replace('a', '')) for x in stock_df['TIMEINDEX'].values]
    interval = int(seconds_interval)
    last_timestamp = raw_indices[0]
    timestamps = []
    for value in raw_indices:
        if value < last_timestamp:
            # abbreviated entry: offset from the most recent full timestamp
            timestamps.append(last_timestamp + value * interval)
        else:
            # full timestamp: becomes the new reference point
            last_timestamp = value
            timestamps.append(value)

    moments = [datetime.datetime.fromtimestamp(ts) for ts in timestamps]
    print("first: {} last: {}".format(
        moments[0].strftime('%Y-%m-%d %H:%M:%S'),
        moments[-1].strftime('%Y-%m-%d %H:%M:%S')))

    # Time of day as HH.MM float (no date part) and the calendar date as text.
    stock_df['TIMEINDEX'] = [float(m.strftime('%H.%M')) for m in moments]
    stock_df['DATE'] = [m.strftime('%Y-%m-%d') for m in moments]

    print("shape: {}".format(stock_df.shape))
    return stock_df
In [726]:
# Query parameters for the intraday download; the values are strings because
# they are formatted straight into the request URL.
ticker = "TNXP" # candidate symbols: TNXP, LPTH, NVDA, BLFS
seconds_interval = "1800" # bar length in seconds: 1800 = 30-minute bars (300 would be 5-minute)
num_of_days = "50" # look-back window in days
stock_X_df = get_stock_data(ticker, seconds_interval, num_of_days)
stock_X_df.head()
Out[726]:
In [727]:
# Each trading day is flattened into one feature vector, so its length is
# (rows recorded that day) x (columns other than DATE). Days can differ in the
# number of bars, hence the smallest length is kept as the common width.
values_per_bar = stock_X_df.shape[1] - 1  # every column except DATE
bars_per_day = stock_X_df['DATE'].value_counts()
num_of_cols = set(int(bar_count) * values_per_bar for bar_count in bars_per_day)
min_num_of_cols = min(num_of_cols)
print(min_num_of_cols)
In [728]:
# Feature matrix: one row per trading day, made of that day's bars flattened
# and truncated to the common width `min_num_of_cols`.
# Rows are collected in a list and stacked once, which avoids re-copying the
# whole matrix on every iteration and keeps X two-dimensional even when only
# a single day is present.
daily_rows = []
for date in stock_X_df['DATE'].unique():
    day_values = stock_X_df[stock_X_df['DATE'] == date].drop('DATE', axis=1).values
    daily_rows.append(day_values.flatten()[:min_num_of_cols])
# The last day is dropped: there is no following day to take a target from.
X = np.vstack(daily_rows)[:-1]
print(X.shape)
pd.DataFrame(X).head()
Out[728]:
In [729]:
# Second download at daily resolution (86400 seconds per bar), same ticker and
# look-back window as the intraday request.
# NOTE(review): this overwrites the intraday `seconds_interval`; re-running the
# intraday download cell after this one would fetch daily bars instead.
seconds_interval = "86400" # daily: 86400
stock_y_df = get_stock_data(ticker, seconds_interval, num_of_days)
stock_y_df.head()
Out[729]:
In [730]:
# Regression target: the daily HIGH with the first entry removed, so that
# entry i holds the value of the day after the i-th daily row.
# Alternative target (disabled): distance from the open to the high.
#y = (stock_y_df['HIGH'].values - stock_y_df['OPEN'].values)
y = stock_y_df['HIGH'].values[1:]
print(y.shape)
y[:5]
Out[730]:
In [731]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so later days can
# end up in the training set - confirm this is acceptable for time-ordered data.
features_train, features_test, labels_train, labels_test = train_test_split(X, y, test_size=0.1, random_state=42)
In [732]:
# Bayesian ridge regression fitted on the training rows; the score on the
# held-out rows is shown as the cell output.
clf = BayesianRidge().fit(features_train, labels_train)
pred = clf.predict(features_test)
clf.score(features_test, labels_test)
Out[732]:
In [733]:
#clf = GaussianNB()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred
In [734]:
#clf = SVC(kernel="linear")
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred
In [741]:
#clf = SVR(kernel='rbf', C=10000.0)
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)
In [736]:
#clf = tree.DecisionTreeClassifier()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)
In [737]:
# Decision-tree regressor on the same split. The estimator is seeded (same
# value as the train/test split) so that Restart & Run All reproduces the
# fitted tree, its predictions and its score.
clf = tree.DecisionTreeRegressor(random_state=42)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
# Score on the held-out rows, displayed as the cell output.
clf.score(features_test, labels_test)
Out[737]:
In [738]:
# Held-out feature rows, shown as a table for inspection.
pd.DataFrame(features_test)
Out[738]:
In [739]:
# Actual target values for the held-out rows.
pd.DataFrame(labels_test)
Out[739]:
In [740]:
# Predictions for the held-out rows from whichever model cell last assigned `pred`.
pd.DataFrame(pred)
Out[740]: