In [724]:
import requests
import csv
import numpy as np
import pandas as pd
import datetime
import Quandl
from matplotlib import pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import tree
%matplotlib inline

In [725]:
def get_stock_data(ticker, seconds_interval, num_of_days):
    """Download price bars from the Google Finance ``getprices`` endpoint.

    Parameters
    ----------
    ticker : str
        Symbol substituted into the ``q`` query parameter.
    seconds_interval : str or int
        Bar width in seconds (``i`` query parameter); also used to expand
        the feed's abbreviated timestamps into full unix timestamps.
    num_of_days : str or int
        Look-back window in days (``p`` query parameter).

    Returns
    -------
    pandas.DataFrame
        The feed's columns, with ``TIMEINDEX`` replaced by each bar's
        local time of day encoded as ``HH.MM`` (09:30 becomes 9.3) and an
        added ``DATE`` column holding the local ``YYYY-MM-DD`` string.

    Raises
    ------
    ValueError
        If the feed contains no data rows.
    """
    goog_fin_url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)

    # pandas downloads the URL itself, so a separate HTTP request whose
    # response is never read would only fetch the same payload twice.
    # skiprows keeps row 4 (the "COLUMNS=DATE,..." header) and drops the
    # non-data rows around it.
    stock_df = pd.read_csv(goog_fin_url, skiprows=[0,1,2,3,5,6])
    stock_df = stock_df.rename(columns={'COLUMNS=DATE': 'TIMEINDEX'})
    if stock_df.empty:
        raise ValueError("no price rows returned for ticker {0!r}".format(ticker))

    # Rows prefixed with 'a' carry a full unix timestamp; every other row
    # is a count of intervals elapsed since the most recent 'a' row.
    step = int(seconds_interval)
    anchor = None
    unix_times = []
    for raw in stock_df['TIMEINDEX'].astype(str):
        if raw.startswith('a'):
            anchor = int(raw[1:])
            unix_times.append(anchor)
        elif anchor is None:
            # No anchor seen yet: treat the first value as a full timestamp.
            anchor = int(raw)
            unix_times.append(anchor)
        else:
            unix_times.append(anchor + int(raw) * step)

    # fromtimestamp() without a tz argument converts to the machine's local time.
    time_indices = [datetime.datetime.fromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S') for t in unix_times]
    print("first: {} last: {}".format(time_indices[0], time_indices[-1]))

    # Time of day as a float: "09:30" -> 9.3, "10:00" -> 10.0.
    stock_df['TIMEINDEX'] = [float(stamp[-8:-3].replace(':', '.')) for stamp in time_indices]
    # Calendar date kept as a separate string column.
    stock_df['DATE'] = [stamp[:10] for stamp in time_indices]
    print("shape: {}".format(stock_df.shape))

    return stock_df

In [726]:
# Intraday request settings. They are kept as strings because
# get_stock_data formats them directly into the request URL.
ticker = "TNXP"  # alternatives: LPTH, NVDA, BLFS
seconds_interval = "1800"  # bar width in seconds (300 would be 5-minute bars)
num_of_days = "50"
stock_X_df = get_stock_data(
    ticker=ticker,
    seconds_interval=seconds_interval,
    num_of_days=num_of_days,
)
stock_X_df.head()


first: 2015-04-10 09:30:00 last: 2015-06-19 16:00:00
shape: (693, 7)
Out[726]:
TIMEINDEX CLOSE HIGH LOW OPEN VOLUME DATE
0 9.3 6.1100 6.11 6.1100 6.11 250 2015-04-10
1 10.0 6.1200 6.13 6.1100 6.12 1400 2015-04-10
2 10.3 6.0999 6.12 6.0901 6.12 5087 2015-04-10
3 11.0 6.1000 6.12 6.1000 6.10 2640 2015-04-10
4 11.3 6.1200 6.13 6.1200 6.13 300 2015-04-10

In [727]:
# Collect, for every trading day, how many values that day's rows hold once
# the DATE column is dropped and the rest is flattened; the smallest count
# becomes the common feature-vector length.
num_of_cols = set()
feature_frame = stock_X_df.drop('DATE', axis=1)
for date in stock_X_df['DATE'].unique():
    day_mask = stock_X_df['DATE'] == date
    num_of_cols.add(feature_frame[day_mask].values.size)
min_num_of_cols = min(num_of_cols)
print(min_num_of_cols)


78

In [728]:
# One feature row per trading day: that day's bars (DATE column dropped)
# flattened row-major and truncated to min_num_of_cols so every row has the
# same length.
# NOTE(review): truncation does not align bars by time of day -- a day whose
# first bar is 10:00 instead of 09:30 ends up with different times in the
# same column positions. Confirm this is acceptable for the model.
day_vectors = []
for date in stock_X_df['DATE'].unique():
    day_rows = stock_X_df[stock_X_df['DATE'] == date].drop('DATE', axis=1)
    day_vectors.append(day_rows.values.flatten()[:min_num_of_cols])

# Stack once instead of re-copying the growing matrix on every iteration;
# this also keeps X two-dimensional when only a single day is present.
X = np.vstack(day_vectors)

# Drop the final day: it has no following day to supply a label.
X = np.delete(X, -1, axis=0)
print(X.shape)
pd.DataFrame(X).head()


(49, 78)
Out[728]:
0 1 2 3 4 5 6 7 8 9 ... 68 69 70 71 72 73 74 75 76 77
0 9.3 6.11 6.11 6.11 6.11 250 10.0 6.12 6.1300 6.110 ... 6.210 6.19 6.19 1100 15.3 6.20 6.23 6.20 6.22 1400
1 9.3 6.22 6.22 6.22 6.22 1005 10.0 6.33 6.3300 6.141 ... 6.233 6.16 6.23 6188 15.3 6.17 6.22 6.15 6.19 6900
2 9.3 6.18 6.18 6.18 6.18 957 10.0 6.20 6.2876 6.180 ... 6.210 6.21 6.21 100 15.3 6.21 6.22 6.20 6.21 2900
3 10.0 6.18 6.18 6.18 6.18 200 10.3 6.21 6.2100 6.190 ... 6.359 6.30 6.34 3250 16.0 6.25 6.33 6.25 6.33 4118
4 9.3 6.29 6.29 6.29 6.29 500 10.0 6.30 6.3000 6.270 ... 6.200 6.18 6.19 2000 15.3 6.15 6.18 6.14 6.16 9369

5 rows × 78 columns


In [729]:
# Same ticker and look-back window, but one bar per day (86400 seconds);
# these daily bars supply the regression target.
seconds_interval = "86400"
stock_y_df = get_stock_data(
    ticker=ticker,
    seconds_interval=seconds_interval,
    num_of_days=num_of_days,
)
stock_y_df.head()


first: 2015-04-10 16:00:00 last: 2015-06-19 16:00:00
shape: (50, 7)
Out[729]:
TIMEINDEX CLOSE HIGH LOW OPEN VOLUME DATE
0 16 6.19 6.2300 6.0900 6.11 46441 2015-04-10
1 16 6.15 6.4700 6.1410 6.22 80132 2015-04-13
2 16 6.21 6.2876 6.1700 6.18 28147 2015-04-14
3 16 6.25 6.4000 6.1800 6.18 30968 2015-04-15
4 16 6.23 6.3000 6.1106 6.29 69650 2015-04-16

In [730]:
# Target: the daily HIGH with the first day removed, so that entry i is the
# HIGH of the day after the i-th row of daily data.
# Alternative target that was tried: HIGH minus OPEN
#   (stock_y_df['HIGH'].values - stock_y_df['OPEN'].values)
y = stock_y_df['HIGH'].values[1:]
print(y.shape)
y[:5]


(49,)
Out[730]:
array([ 6.47  ,  6.2876,  6.4   ,  6.3   ,  6.27  ])

In [731]:
# Hold out 10% of the samples for evaluation; the fixed seed makes the
# split repeatable.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    X, y, test_size=0.1, random_state=42)

In [732]:
# Bayesian ridge regression: fit on the training split, predict the held-out
# rows, and display the estimator's score on them.
clf = BayesianRidge().fit(features_train, labels_train)
pred = clf.predict(features_test)
clf.score(features_test, labels_test)


Out[732]:
0.88482434911981933

In [733]:
#clf = GaussianNB()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred

In [734]:
#clf = SVC(kernel="linear")
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred

In [741]:
#clf = SVR(kernel='rbf', C=10000.0)
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)

In [736]:
#clf = tree.DecisionTreeClassifier()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)

In [737]:
# Decision-tree regression on the same split.
# The tree is seeded (same value as the train/test split) because split
# selection shuffles the candidate features; without a fixed random_state
# the fitted tree, and therefore the score, can change between runs.
clf = tree.DecisionTreeRegressor(random_state=42)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
clf.score(features_test, labels_test)


Out[737]:
0.90088366360621641

In [738]:
# Held-out feature rows, rendered as a table.
pd.DataFrame(data=features_test)


Out[738]:
0 1 2 3 4 5 6 7 8 9 ... 68 69 70 71 72 73 74 75 76 77
0 10.0 6.27 6.31 6.25 6.31 1300 10.3 6.3000 6.30 6.200 ... 6.2363 6.2200 6.2363 700 16.0 6.2000 6.250 6.17 6.25 2220
1 9.3 8.90 8.90 8.90 8.90 13570 10.0 9.0000 9.34 8.680 ... 9.2370 9.1701 9.2000 15979 15.3 9.2000 9.240 9.15 9.17 35150
2 9.3 9.97 9.97 9.97 9.97 5792 10.0 10.0500 10.32 9.960 ... 10.4200 10.3400 10.3800 25295 15.3 10.4300 10.430 10.28 10.40 30946
3 9.3 8.60 8.60 8.60 8.60 10252 10.0 9.0115 9.10 8.400 ... 9.0300 8.9300 8.9500 47051 15.3 8.8343 9.015 8.80 9.00 22472
4 9.3 6.03 6.03 6.03 6.03 476 10.0 6.0600 6.28 6.035 ... 6.1200 6.0700 6.1200 1100 15.3 6.0300 6.068 6.03 6.05 1200

5 rows × 78 columns


In [739]:
# Actual target values for the held-out rows.
pd.DataFrame(data=labels_test)


Out[739]:
0
0 6.4300
1 9.9499
2 10.4000
3 9.3400
4 6.1300

In [740]:
# Predictions from the most recently fitted estimator for the held-out rows.
pd.DataFrame(data=pred)


Out[740]:
0
0 6.75
1 10.45
2 10.29
3 10.45
4 6.27