In [153]:
import requests
import csv
import numpy as np
import pandas as pd
import datetime
import Quandl
from matplotlib import pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import tree
%matplotlib inline

In [154]:
def _expand_time_offsets(raw_indices, seconds_interval):
    """Expand abbreviated time indices into full unix timestamps.

    The feed mixes two kinds of values in one column: full unix timestamps
    (the rows that carried an 'a' prefix) and small integers that count
    intervals elapsed since the most recent full timestamp.

    A value is treated as an offset when it is smaller than the most recent
    full timestamp; otherwise it becomes the new reference timestamp.

    Args:
        raw_indices: sequence of ints (the 'a' prefix already stripped).
        seconds_interval: bar width in seconds (int or numeric string).

    Returns:
        List of full unix timestamps, same length as ``raw_indices``.
    """
    interval = int(seconds_interval)
    expanded = []
    anchor = None
    for value in raw_indices:
        if anchor is not None and value < anchor:
            # offset row: N intervals after the last full timestamp
            expanded.append(anchor + value * interval)
        else:
            # full timestamp row: becomes the new reference point
            anchor = value
            expanded.append(value)
    return expanded


def get_stock_data(ticker, seconds_interval, num_of_days):
    """Download OHLCV bars for ``ticker`` from the Google Finance getprices feed.

    Args:
        ticker: symbol to query, e.g. "TNXP".
        seconds_interval: bar width in seconds (string or int), e.g. "1800".
        num_of_days: number of days of history to request (string or int).

    Returns:
        pandas.DataFrame with the feed's price columns plus:
          * TIMEINDEX - time of day encoded as float("HH.MM"), so 09:30
            becomes 9.3 and 10:00 becomes 10.0 (this is not a decimal hour).
          * DATE - calendar date as a 'YYYY-MM-DD' string.

    Raises:
        ValueError: if the feed returned no data rows.

    Notes:
        Timestamps are converted with ``datetime.fromtimestamp``, which uses
        the local timezone of the machine running the notebook.
    """
    goog_fin_url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)

    # Fetch once. Rows 0-3, 5 and 6 of the response are skipped so that
    # row 4 (the one starting with "COLUMNS=DATE") is used as the header.
    stock_df = pd.read_csv(goog_fin_url, skiprows=[0,1,2,3,5,6])
    if stock_df.empty:
        raise ValueError(
            "no price rows returned for ticker {0!r} from {1}".format(ticker, goog_fin_url))

    # rename the header-derived column name
    stock_df = stock_df.rename(columns={'COLUMNS=DATE':'TIMEINDEX'})
    # remove the 'a' marker from full unix timestamps
    stock_df = stock_df.replace(to_replace={'TIMEINDEX':{'a':''}}, regex=True)

    raw_indices = [int(x) for x in stock_df['TIMEINDEX'].values]
    timestamps = _expand_time_offsets(raw_indices, seconds_interval)

    # convert unix timestamps to human-readable strings (local time)
    time_labels = [datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
                   for ts in timestamps]
    print("first: {} last: {}".format(time_labels[0], time_labels[-1]))

    # keep only the time of day: 'HH:MM' -> float('HH.MM')
    stock_df['TIMEINDEX'] = [float(label[-8:-3].replace(':', '.')) for label in time_labels]
    # keep the calendar date in its own column
    stock_df['DATE'] = [label[:10] for label in time_labels]

    print("shape: {}".format(stock_df.shape))

    return stock_df

In [155]:
# Smoke-test the downloader on a single ticker and preview the first rows.
# All three query parameters are passed as strings and formatted into the URL.
ticker = "TNXP" # other tickers tried: TNXP, LPTH, NVDA, BLFS
seconds_interval = "1800" # bar width in seconds (1800 = 30-minute, 300 = 5-minute)
num_of_days = "50" # days of history to request
stock_X_df = get_stock_data(ticker, seconds_interval, num_of_days)
stock_X_df.head()


first: 2015-04-10 09:30:00 last: 2015-06-19 16:00:00
shape: (693, 7)
Out[155]:
TIMEINDEX CLOSE HIGH LOW OPEN VOLUME DATE
0 9.3 6.1100 6.11 6.1100 6.11 250 2015-04-10
1 10.0 6.1200 6.13 6.1100 6.12 1400 2015-04-10
2 10.3 6.0999 6.12 6.0901 6.12 5087 2015-04-10
3 11.0 6.1000 6.12 6.1000 6.10 2640 2015-04-10
4 11.3 6.1200 6.13 6.1200 6.13 300 2015-04-10

In [156]:
def create_data(ticker, seconds_interval="1800", num_of_days="50"):
    """Build a (features, target) pair for next-day HIGH prediction.

    Features: one row per trading day, made by flattening that day's
    intraday bars (every column except DATE) into a single vector. Days can
    have different bar counts, so every vector is truncated to the shortest
    day's length. The final day is dropped because it has no following day.

    Target: the daily HIGH with the first day dropped, so that row ``i`` of
    ``X`` (day i) is paired with the HIGH of day i+1.

    Args:
        ticker: symbol to download.
        seconds_interval: intraday bar width in seconds (default "1800").
        num_of_days: days of history for both downloads (default "50").

    Returns:
        (X, y): X is a 2-D float array of shape (n_days - 1, n_features),
        y is a 1-D array of next-day HIGH values.

    Raises:
        ValueError: if the intraday and daily downloads cover a different
            number of days, so X and y cannot be paired row for row.
    """
    daily_interval = "86400"  # one bar per day

    stock_X_df = get_stock_data(ticker, seconds_interval, num_of_days)

    # One flattened vector per date, in order of first appearance
    # (sort=False keeps the same order as Series.unique()).
    day_vectors = [
        day_df.drop('DATE', axis=1).values.flatten()
        for _, day_df in stock_X_df.groupby('DATE', sort=False)
    ]
    min_num_of_cols = min(vec.shape[0] for vec in day_vectors)
    print(min_num_of_cols)

    # Stack once; vstack always yields a 2-D array, so dropping the last
    # *row* below is correct even when only a single day was returned.
    X = np.vstack([vec[:min_num_of_cols] for vec in day_vectors])
    X = np.delete(X, -1, axis=0)

    stock_y_df = get_stock_data(ticker, daily_interval, num_of_days)

    # shift by one day: the target for day i is the HIGH of day i+1
    y = stock_y_df['HIGH'].values[1:]

    if X.shape[0] != y.shape[0]:
        raise ValueError(
            "intraday/daily day counts differ for {0!r}: {1} feature rows vs {2} targets".format(
                ticker, X.shape[0], y.shape[0]))

    return X, y

In [157]:
# Assemble one combined dataset by stacking the per-ticker feature rows and
# targets. X and y start as empty arrays; the first ticker's data replaces
# them and each later ticker is appended.
tickers = ["TNXP", "NVDA"] # TNXP, LPTH, NVDA, BLFS
X, y = np.array([]), np.array([])
for ticker in tickers:
    newX, newy = create_data(ticker)
    if X.shape[0]:
        # already holding data: append rows / targets
        X, y = np.vstack([X, newX]), np.hstack([y, newy])
    else:
        # nothing accumulated yet: adopt this ticker's arrays as-is
        X, y = newX, newy


first: 2015-04-10 09:30:00 last: 2015-06-19 16:00:00
shape: (693, 7)
78
first: 2015-04-10 16:00:00 last: 2015-06-19 16:00:00
shape: (50, 7)
first: 2015-04-10 09:30:00 last: 2015-06-19 16:00:00
shape: (699, 7)
78
first: 2015-04-10 16:00:00 last: 2015-06-19 16:00:00
shape: (50, 7)

In [158]:
# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
# NOTE(review): each row is one trading day and rows from both tickers are mixed,
# so a shuffled split presumably places adjacent days of the same ticker on both
# sides of the split - scores below may be optimistic. Consider a chronological
# split per ticker; confirm before relying on these numbers.
features_train, features_test, labels_train, labels_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [159]:
# Bayesian ridge regression baseline: fit on the training rows, predict the
# held-out rows, and display the regressor's score on the test set.
clf = BayesianRidge().fit(features_train, labels_train)
pred = clf.predict(features_test)
clf.score(features_test, labels_test)


Out[159]:
0.99845958254593303

In [160]:
#clf = GaussianNB()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred

In [161]:
#clf = SVC(kernel="linear")
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred

In [162]:
#clf = SVR(kernel='rbf', C=10000.0)
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)

In [163]:
#clf = tree.DecisionTreeClassifier()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)

In [164]:
# Decision-tree regressor on the same split. This rebinds both `clf` and
# `pred`, so later cells that read `pred` see the tree's predictions.
clf = tree.DecisionTreeRegressor().fit(features_train, labels_train)
pred = clf.predict(features_test)
clf.score(features_test, labels_test)


Out[164]:
0.99116887537044107

In [165]:
# Preview the first held-out feature rows (one flattened trading day per row).
pd.DataFrame(features_test).head()


Out[165]:
0 1 2 3 4 5 6 7 8 9 ... 68 69 70 71 72 73 74 75 76 77
0 9.3 22.14 22.15 22.14 22.15 29741 10 22.2700 22.3600 22.05 ... 22.295 22.210 22.22 247314 15.3 22.180 22.260 22.13 22.260 339140
1 9.3 7.71 7.71 7.71 7.71 1571 10 7.9400 7.9471 7.77 ... 8.150 8.040 8.05 26390 15.3 8.170 8.180 8.12 8.120 20036
2 9.3 21.00 21.01 21.00 21.00 78069 10 20.8700 21.0500 20.87 ... 21.060 20.990 21.05 284326 15.3 20.995 21.005 20.96 21.000 360872
3 9.3 6.13 6.13 6.13 6.13 7000 10 6.1000 6.1000 6.04 ... 5.920 5.881 5.92 2193 15.3 5.940 5.940 5.92 5.920 602
4 9.3 20.87 20.87 20.87 20.87 68318 10 21.0999 21.1600 20.80 ... 21.860 21.650 21.75 1639869 15.3 21.849 21.935 21.73 21.795 1487868

5 rows × 78 columns


In [166]:
# Preview the true targets (next-day HIGH) for the first held-out rows.
pd.DataFrame(labels_test).head()


Out[166]:
0
0 22.3050
1 8.6000
2 21.3600
3 6.2100
4 22.2139

In [167]:
# Preview the predictions for the same rows. In the recorded execution order
# `pred` was last assigned by the DecisionTreeRegressor cell (In [164]).
pd.DataFrame(pred).head()


Out[167]:
0
0 22.360
1 10.450
2 21.140
3 6.130
4 21.165

In [168]:
# Residuals (actual minus predicted) on the held-out rows. `pred` here is the
# DecisionTreeRegressor output from In [164], not the BayesianRidge one.
labels_test - pred


Out[168]:
array([-0.055 , -1.85  ,  0.22  ,  0.08  ,  1.0489,  0.2401,  0.19  ,
       -1.7   , -0.758 ,  0.07  ,  0.13  ,  0.105 ,  0.4501,  0.1084,
       -0.6099, -0.11  ,  0.05  , -0.3819, -0.42  ,  0.315 ])