notebook.community

Edit and run



In [153]:

    
import requests
import csv
import numpy as np
import pandas as pd
import datetime
import Quandl
from matplotlib import pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import tree
%matplotlib inline



In [154]:

    
def get_stock_data(ticker, seconds_interval, num_of_days):
    """Download price bars from the Google Finance ``getprices`` endpoint.

    Parameters
    ----------
    ticker : str
        Symbol placed in the ``q`` query parameter.
    seconds_interval : str or int
        Bar width in seconds (``i`` query parameter). It is also the
        multiplier used to expand abbreviated timestamps in the feed.
    num_of_days : str or int
        Look-back window in days (``p`` query parameter).

    Returns
    -------
    pandas.DataFrame
        The feed's columns with the first one renamed to ``TIMEINDEX`` and
        overwritten by the bar's time of day as a float built from
        ``HH.MM`` (09:30 -> 9.3), plus a ``DATE`` column holding
        ``'YYYY-MM-DD'`` strings.

    Raises
    ------
    IndexError
        If the feed contains no data rows.
    """
    goog_fin_url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)

    # Fetch the feed exactly once: read_csv downloads the URL itself, so a
    # separate requests.get() whose response is discarded only doubles traffic.
    # Skipping rows 0-3 and 5-6 leaves row 4 as the header line.
    stock_df = pd.read_csv(goog_fin_url, skiprows=[0, 1, 2, 3, 5, 6])
    stock_df = stock_df.rename(columns={'COLUMNS=DATE': 'TIMEINDEX'})

    def _stamp_to_int(cell):
        # Cells are either plain integers or carry an 'a' marker that must be
        # stripped before conversion (e.g. 'a1428672600').
        try:
            return int(cell)
        except ValueError:
            return int(str(cell).replace('a', ''))

    # Expand abbreviated timestamps: a value smaller than the current base is
    # a bar offset from it; anything else is a full unix timestamp that
    # becomes the new base.
    step = int(seconds_interval)
    base = None
    unix_stamps = []
    for cell in stock_df['TIMEINDEX'].values:
        value = _stamp_to_int(cell)
        if base is not None and value < base:
            value = base + value * step
        else:
            base = value
        unix_stamps.append(value)

    # NOTE(review): fromtimestamp() uses the machine's local timezone, so the
    # derived DATE/TIMEINDEX values depend on where the notebook runs.
    stamps_text = [
        datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d %H:%M:%S')
        for stamp in unix_stamps
    ]
    print("first: {} last: {}".format(stamps_text[0], stamps_text[-1]))

    # Time of day only (no date).
    # NOTE(review): 'HH:MM' becomes the float HH.MM, which is not a linear
    # measure of time (9.3 means 09:30, not 9.3 hours).
    stock_df['TIMEINDEX'] = [float(text[-8:-3].replace(':', '.')) for text in stamps_text]

    # Calendar date only.
    stock_df['DATE'] = [text[:10] for text in stamps_text]

    print("shape: {}".format(stock_df.shape))

    return stock_df



In [155]:

    
# Smoke-test the downloader on one symbol and preview the first rows.
# Other symbols tried: LPTH, NVDA, BLFS. An interval of "300" gives 5-minute bars.
ticker, seconds_interval, num_of_days = "TNXP", "1800", "50"
stock_X_df = get_stock_data(ticker, seconds_interval, num_of_days)
stock_X_df.head()









    



first: 2015-04-10 09:30:00 last: 2015-06-19 16:00:00
shape: (693, 7)






    Out[155]:






  
    
      
      TIMEINDEX
      CLOSE
      HIGH
      LOW
      OPEN
      VOLUME
      DATE
    
  
  
    
      0
      9.3
      6.1100
      6.11
      6.1100
      6.11
      250
      2015-04-10
    
    
      1
      10.0
      6.1200
      6.13
      6.1100
      6.12
      1400
      2015-04-10
    
    
      2
      10.3
      6.0999
      6.12
      6.0901
      6.12
      5087
      2015-04-10
    
    
      3
      11.0
      6.1000
      6.12
      6.1000
      6.10
      2640
      2015-04-10
    
    
      4
      11.3
      6.1200
      6.13
      6.1200
      6.13
      300
      2015-04-10



In [156]:

    
def create_data(ticker, seconds_interval="1800", num_of_days="50"):
    """Build a next-day-high regression dataset for one ticker.

    Each sample is one trading day's intraday bars (every column except
    ``DATE``) flattened row by row and truncated to the length of the
    shortest day, so all samples have the same width. The label for a day is
    the ``HIGH`` of the following row in the daily (86400-second) feed: the
    last intraday day is dropped from ``X`` and the first daily row is
    dropped from ``y``.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``get_stock_data``.
    seconds_interval : str or int, optional
        Intraday bar width in seconds; defaults to "1800".
    num_of_days : str or int, optional
        Look-back window in days used for both feeds; defaults to "50".

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape (days - 1, features) and ``y`` with shape
        (days - 1,).

    Raises
    ------
    ValueError
        If the intraday feed has no rows, or if the intraday and daily feeds
        produce different sample counts (which would silently pair features
        with the wrong labels).
    """
    stock_X_df = get_stock_data(ticker, seconds_interval, num_of_days)

    # Flatten each day's bars exactly once, in order of first appearance.
    day_vectors = []
    for date in stock_X_df['DATE'].unique():
        day_rows = stock_X_df[stock_X_df['DATE'] == date].drop('DATE', axis=1)
        day_vectors.append(day_rows.values.flatten())

    # Days can have different bar counts; truncate to the shortest one.
    min_num_of_cols = min(len(vector) for vector in day_vectors)
    print(min_num_of_cols)

    # Stack once (always 2-D, even for a single day), then drop the last day
    # because it has no following day to supply a label.
    X = np.vstack([vector[:min_num_of_cols] for vector in day_vectors])[:-1]

    daily_interval = "86400"  # one bar per day
    stock_y_df = get_stock_data(ticker, daily_interval, num_of_days)

    # Drop the first daily row so that y[i] belongs to the day after X[i].
    y = stock_y_df['HIGH'].values[1:]

    if X.shape[0] != y.shape[0]:
        raise ValueError(
            "intraday and daily feeds disagree for {0}: {1} feature rows vs {2} labels".format(
                ticker, X.shape[0], y.shape[0]))

    return X, y



In [157]:

    
# Assemble one dataset from several symbols: features are stacked row-wise
# and labels are concatenated in the same order.
tickers = ["TNXP", "NVDA"] # TNXP, LPTH, NVDA, BLFS
feature_blocks = []
label_blocks = []
for ticker in tickers:
    ticker_X, ticker_y = create_data(ticker)
    feature_blocks.append(ticker_X)
    label_blocks.append(ticker_y)

# With no tickers there is nothing to stack; fall back to empty arrays.
X = np.vstack(feature_blocks) if feature_blocks else np.array([])
y = np.hstack(label_blocks) if label_blocks else np.array([])









    



first: 2015-04-10 09:30:00 last: 2015-06-19 16:00:00
shape: (693, 7)
78
first: 2015-04-10 16:00:00 last: 2015-06-19 16:00:00
shape: (50, 7)
first: 2015-04-10 09:30:00 last: 2015-06-19 16:00:00
shape: (699, 7)
78
first: 2015-04-10 16:00:00 last: 2015-06-19 16:00:00
shape: (50, 7)



In [158]:

    
# Hold out 20% of the samples for evaluation; the seed is fixed so the split
# is repeatable.
# NOTE(review): rows are consecutive trading days, so a random split mixes
# earlier and later days between train and test - consider a chronological split.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    X, y, test_size=0.2, random_state=42)



In [159]:

    
# Fit a Bayesian ridge regressor on the training split, predict the held-out
# samples, and display the model's score on them.
clf = BayesianRidge().fit(features_train, labels_train)
pred = clf.predict(features_test)
clf.score(features_test, labels_test)









    Out[159]:





0.99845958254593303



In [160]:

    
#clf = GaussianNB()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred



In [161]:

    
#clf = SVC(kernel="linear")
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred



In [162]:

    
#clf = SVR(kernel='rbf', C=10000.0)
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)



In [163]:

    
#clf = tree.DecisionTreeClassifier()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)



In [164]:

    
# Fit a decision-tree regressor on the training split and display its score
# on the held-out samples. `clf` and `pred` are rebound here, so the cells
# below show this model's predictions.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features at every split; without a seed, repeated runs can
# produce different trees and scores.
clf = tree.DecisionTreeRegressor(random_state=42)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
clf.score(features_test, labels_test)









    Out[164]:





0.99116887537044107



In [165]:

    
# Preview the first five held-out feature rows.
pd.DataFrame(features_test).iloc[:5]









    Out[165]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
    
  
  
    
      0
      9.3
      22.14
      22.15
      22.14
      22.15
      29741
      10
      22.2700
      22.3600
      22.05
      ...
      22.295
      22.210
      22.22
      247314
      15.3
      22.180
      22.260
      22.13
      22.260
      339140
    
    
      1
      9.3
      7.71
      7.71
      7.71
      7.71
      1571
      10
      7.9400
      7.9471
      7.77
      ...
      8.150
      8.040
      8.05
      26390
      15.3
      8.170
      8.180
      8.12
      8.120
      20036
    
    
      2
      9.3
      21.00
      21.01
      21.00
      21.00
      78069
      10
      20.8700
      21.0500
      20.87
      ...
      21.060
      20.990
      21.05
      284326
      15.3
      20.995
      21.005
      20.96
      21.000
      360872
    
    
      3
      9.3
      6.13
      6.13
      6.13
      6.13
      7000
      10
      6.1000
      6.1000
      6.04
      ...
      5.920
      5.881
      5.92
      2193
      15.3
      5.940
      5.940
      5.92
      5.920
      602
    
    
      4
      9.3
      20.87
      20.87
      20.87
      20.87
      68318
      10
      21.0999
      21.1600
      20.80
      ...
      21.860
      21.650
      21.75
      1639869
      15.3
      21.849
      21.935
      21.73
      21.795
      1487868
    
  

5 rows × 78 columns



In [166]:

    
# Preview the first five held-out labels.
pd.DataFrame(labels_test).iloc[:5]



In [167]:

    
# Preview the first five predictions from the most recently fitted model.
pd.DataFrame(pred).iloc[:5]



In [168]:

    
# Residuals on the held-out set: actual label minus prediction.
np.subtract(labels_test, pred)









    Out[168]:





array([-0.055 , -1.85  ,  0.22  ,  0.08  ,  1.0489,  0.2401,  0.19  ,
       -1.7   , -0.758 ,  0.07  ,  0.13  ,  0.105 ,  0.4501,  0.1084,
       -0.6099, -0.11  ,  0.05  , -0.3819, -0.42  ,  0.315 ])

	0
0	22.3050
1	8.6000
2	21.3600
3	6.2100
4	22.2139

	0
0	22.360
1	10.450
2	21.140
3	6.130
4	21.165

	TIMEINDEX	CLOSE	HIGH	LOW	OPEN	VOLUME	DATE
0	9.3	6.1100	6.11	6.1100	6.11	250	2015-04-10
1	10.0	6.1200	6.13	6.1100	6.12	1400	2015-04-10
2	10.3	6.0999	6.12	6.0901	6.12	5087	2015-04-10
3	11.0	6.1000	6.12	6.1000	6.10	2640	2015-04-10
4	11.3	6.1200	6.13	6.1200	6.13	300	2015-04-10

	0	1	2	3	4	5	6	7	8	9	...	68	69	70	71	72	73	74	75	76	77
0	9.3	22.14	22.15	22.14	22.15	29741	10	22.2700	22.3600	22.05	...	22.295	22.210	22.22	247314	15.3	22.180	22.260	22.13	22.260	339140
1	9.3	7.71	7.71	7.71	7.71	1571	10	7.9400	7.9471	7.77	...	8.150	8.040	8.05	26390	15.3	8.170	8.180	8.12	8.120	20036
2	9.3	21.00	21.01	21.00	21.00	78069	10	20.8700	21.0500	20.87	...	21.060	20.990	21.05	284326	15.3	20.995	21.005	20.96	21.000	360872
3	9.3	6.13	6.13	6.13	6.13	7000	10	6.1000	6.1000	6.04	...	5.920	5.881	5.92	2193	15.3	5.940	5.940	5.92	5.920	602
4	9.3	20.87	20.87	20.87	20.87	68318	10	21.0999	21.1600	20.80	...	21.860	21.650	21.75	1639869	15.3	21.849	21.935	21.73	21.795	1487868