notebook.community

Edit and run



In [724]:

    
import requests
import csv
import numpy as np
import pandas as pd
import datetime
import Quandl
from matplotlib import pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import tree
%matplotlib inline



In [725]:

    
def get_stock_data(ticker, seconds_interval, num_of_days):
    """Download price bars from the Google Finance ``getprices`` endpoint.

    Parameters
    ----------
    ticker : str
        Symbol substituted into the ``q`` query parameter.
    seconds_interval : str or int
        Bar width in seconds (``i`` query parameter); also used to expand
        the abbreviated timestamps, so it must be convertible with ``int()``.
    num_of_days : str or int
        Number of days requested (``p`` query parameter, sent as ``<n>d``).

    Returns
    -------
    pandas.DataFrame
        The downloaded columns, with the first column renamed to
        ``TIMEINDEX`` and holding the time of day encoded as a float
        ``HH.MM`` (09:30 -> 9.3, 10:00 -> 10.0; this is not decimal hours),
        plus a ``DATE`` column holding ``YYYY-MM-DD`` strings.

    Raises
    ------
    ValueError
        If the response contains no data rows.

    Notes
    -----
    Timestamps are converted with ``datetime.fromtimestamp``, i.e. in the
    local timezone of the machine running the code.
    NOTE(review): confirm this endpoint is still served before relying on it.
    """
    goog_fin_url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)

    # pandas downloads the URL itself, so a separate HTTP request whose
    # response is thrown away would only double the network traffic.
    # Rows 0-3 and 5-6 are skipped so that row 4 becomes the header
    # (presumably the skipped rows are response metadata -- TODO confirm).
    stock_df = pd.read_csv(goog_fin_url, skiprows=[0,1,2,3,5,6])
    if stock_df.empty:
        raise ValueError("no price rows returned for {0!r} ({1})".format(ticker, goog_fin_url))

    # rename column name (returns a new frame; the parsed one is not mutated)
    stock_df = stock_df.rename(columns={'COLUMNS=DATE':'TIMEINDEX'})

    # remove 'a' from unix timestamps and convert the entire column to ints
    time_indices = [int(str(x).replace('a', '')) for x in stock_df['TIMEINDEX'].values]

    # convert unix timestamp abbreviations into full unix timestamps:
    # a value smaller than the current base is an offset counted in
    # intervals from that base; anything else becomes the new base.
    interval = int(seconds_interval)
    last_timestamp = time_indices[0]
    for i, value in enumerate(time_indices):
        if value < last_timestamp:
            time_indices[i] = last_timestamp + (value * interval)
        else:
            last_timestamp = value

    # convert unix timestamps to human-readable formats (local time)
    time_indices = [datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S') for x in time_indices]
    # print first and last (single formatted string: same output on Python 2 and 3)
    print("first: {} last: {}".format(time_indices[0], time_indices[-1]))

    # keep time of day only (i.e., no dates), "HH:MM" -> float HH.MM
    stock_df['TIMEINDEX'] = [float(x[-8:-3].replace(':','.')) for x in time_indices]

    # keep date in its own column
    stock_df['DATE'] = [x[:10] for x in time_indices]

    print("shape: {}".format(stock_df.shape))

    return stock_df



In [726]:

    
# Intraday request settings; all three are strings that get_stock_data
# formats into the request URL.
ticker = "TNXP"  # symbols listed by the author: TNXP, LPTH, NVDA, BLFS
seconds_interval = "1800"  # bar width in seconds: 1800 = 30-minute (300 would be 5-minute)
num_of_days = "50"

# Intraday bars: the source of the per-day feature vectors built further down.
stock_X_df = get_stock_data(
    ticker=ticker,
    seconds_interval=seconds_interval,
    num_of_days=num_of_days,
)
stock_X_df.head()









    



first: 2015-04-10 09:30:00 last: 2015-06-19 16:00:00
shape: (693, 7)






    Out[726]:






  
    
      
      TIMEINDEX
      CLOSE
      HIGH
      LOW
      OPEN
      VOLUME
      DATE
    
  
  
    
      0
      9.3
      6.1100
      6.11
      6.1100
      6.11
      250
      2015-04-10
    
    
      1
      10.0
      6.1200
      6.13
      6.1100
      6.12
      1400
      2015-04-10
    
    
      2
      10.3
      6.0999
      6.12
      6.0901
      6.12
      5087
      2015-04-10
    
    
      3
      11.0
      6.1000
      6.12
      6.1000
      6.10
      2640
      2015-04-10
    
    
      4
      11.3
      6.1200
      6.13
      6.1200
      6.13
      300
      2015-04-10



In [727]:

    
# Each trading day is later flattened into one feature vector, but days do not
# all contain the same number of bars. Collect the distinct flattened lengths
# (rows * non-DATE columns) and keep the smallest, so every day can be
# truncated to a common width.
# groupby visits each day once instead of re-masking the whole frame per date;
# sort=False keeps the days in order of first appearance.
num_of_cols = set(
    day_rows.drop('DATE', axis=1).values.size
    for _, day_rows in stock_X_df.groupby('DATE', sort=False)
)
min_num_of_cols = min(num_of_cols)
# parenthesised single argument: same output on Python 2 and Python 3
print(min_num_of_cols)



In [728]:

    
# Build the feature matrix: one row per trading day, made of that day's bars
# (all columns except DATE) flattened row by row and truncated to the
# shortest day's length so every row has the same width.
# NOTE(review): truncation keeps the *first* min_num_of_cols values of each
# day, so days whose first bar is at a different time are not column-aligned.
day_vectors = [
    day_rows.drop('DATE', axis=1).values.flatten()[:min_num_of_cols]
    for _, day_rows in stock_X_df.groupby('DATE', sort=False)
]

# Stack once (instead of re-copying the array on every iteration); the result
# is always 2-D, even when only a single day is present.
X = np.vstack(day_vectors)

# Drop the final day: the target is taken from the following daily row
# (y = HIGH[1:]), so the last day has no target to pair with.
X = np.delete(X, -1, axis=0)
# parenthesised single argument: same output on Python 2 and Python 3
print(X.shape)
pd.DataFrame(X).head()









    



(49, 78)






    Out[728]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
    
  
  
    
      0
      9.3
      6.11
      6.11
      6.11
      6.11
      250
      10.0
      6.12
      6.1300
      6.110
      ...
      6.210
      6.19
      6.19
      1100
      15.3
      6.20
      6.23
      6.20
      6.22
      1400
    
    
      1
      9.3
      6.22
      6.22
      6.22
      6.22
      1005
      10.0
      6.33
      6.3300
      6.141
      ...
      6.233
      6.16
      6.23
      6188
      15.3
      6.17
      6.22
      6.15
      6.19
      6900
    
    
      2
      9.3
      6.18
      6.18
      6.18
      6.18
      957
      10.0
      6.20
      6.2876
      6.180
      ...
      6.210
      6.21
      6.21
      100
      15.3
      6.21
      6.22
      6.20
      6.21
      2900
    
    
      3
      10.0
      6.18
      6.18
      6.18
      6.18
      200
      10.3
      6.21
      6.2100
      6.190
      ...
      6.359
      6.30
      6.34
      3250
      16.0
      6.25
      6.33
      6.25
      6.33
      4118
    
    
      4
      9.3
      6.29
      6.29
      6.29
      6.29
      500
      10.0
      6.30
      6.3000
      6.270
      ...
      6.200
      6.18
      6.19
      2000
      15.3
      6.15
      6.18
      6.14
      6.16
      9369
    
  

5 rows × 78 columns



In [729]:

    
# Query the same ticker and day range again, this time with one bar per day;
# the daily HIGH column is what the targets are taken from.
# Note: this rebinds seconds_interval, so re-running the intraday cell after
# this one would fetch daily bars unless the settings cell is re-run first.
seconds_interval = "86400"  # 86400 seconds = daily
stock_y_df = get_stock_data(
    ticker=ticker,
    seconds_interval=seconds_interval,
    num_of_days=num_of_days,
)
stock_y_df.head()









    



first: 2015-04-10 16:00:00 last: 2015-06-19 16:00:00
shape: (50, 7)






    Out[729]:






  
    
      
      TIMEINDEX
      CLOSE
      HIGH
      LOW
      OPEN
      VOLUME
      DATE
    
  
  
    
      0
      16
      6.19
      6.2300
      6.0900
      6.11
      46441
      2015-04-10
    
    
      1
      16
      6.15
      6.4700
      6.1410
      6.22
      80132
      2015-04-13
    
    
      2
      16
      6.21
      6.2876
      6.1700
      6.18
      28147
      2015-04-14
    
    
      3
      16
      6.25
      6.4000
      6.1800
      6.18
      30968
      2015-04-15
    
    
      4
      16
      6.23
      6.3000
      6.1106
      6.29
      69650
      2015-04-16



In [730]:

    
# Target vector: the daily HIGH with the first day dropped, so that entry i is
# the HIGH of the day *after* feature row i (X had its last day removed).
# Alternative target kept from the original notebook (intraday range above open):
#y = (stock_y_df['HIGH'].values - stock_y_df['OPEN'].values)
y = stock_y_df['HIGH'].values[1:]
# parenthesised single argument: same output on Python 2 and Python 3
print(y.shape)
y[:5]









    



(49,)






    Out[730]:





array([ 6.47  ,  6.2876,  6.4   ,  6.3   ,  6.27  ])



In [731]:

    
# Hold out 10% of the (day features, next-day HIGH) pairs for evaluation; the
# fixed random_state makes the split repeatable.
# NOTE(review): this is a shuffled split of time-ordered rows, so test days
# can precede training days -- confirm that is acceptable for this analysis.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
)



In [732]:

    
# Baseline regressor: fit on the training rows (fit returns the estimator
# itself, so construction and fitting can be chained).
clf = BayesianRidge().fit(features_train, labels_train)
# Predictions for the held-out rows, kept for inspection.
pred = clf.predict(features_test)
# Cell output: the regressor's score on the held-out rows
# (R^2 for scikit-learn regressors).
clf.score(features_test, labels_test)









    Out[732]:





0.88482434911981933



In [733]:

    
#clf = GaussianNB()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred



In [734]:

    
#clf = SVC(kernel="linear")
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#acc = accuracy_score(pred, labels_test)
#print acc, pred



In [741]:

    
#clf = SVR(kernel='rbf', C=10000.0)
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)



In [736]:

    
#clf = tree.DecisionTreeClassifier()
#clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)
#clf.score(features_test, labels_test)



In [737]:

    
# Decision-tree regressor on the same split. The tree breaks ties between
# equally good splits using its random state, so seed it (same seed as the
# train/test split) to make the fitted tree and the score below repeatable
# across runs.
# Note: clf and pred are rebound here, replacing the BayesianRidge results.
clf = tree.DecisionTreeRegressor(random_state=42)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
# Cell output: score on the held-out rows (R^2 for scikit-learn regressors).
clf.score(features_test, labels_test)









    Out[737]:





0.90088366360621641



In [738]:

    
# Show the held-out feature rows produced by train_test_split as a table.
pd.DataFrame(features_test)









    Out[738]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
    
  
  
    
      0
      10.0
      6.27
      6.31
      6.25
      6.31
      1300
      10.3
      6.3000
      6.30
      6.200
      ...
      6.2363
      6.2200
      6.2363
      700
      16.0
      6.2000
      6.250
      6.17
      6.25
      2220
    
    
      1
      9.3
      8.90
      8.90
      8.90
      8.90
      13570
      10.0
      9.0000
      9.34
      8.680
      ...
      9.2370
      9.1701
      9.2000
      15979
      15.3
      9.2000
      9.240
      9.15
      9.17
      35150
    
    
      2
      9.3
      9.97
      9.97
      9.97
      9.97
      5792
      10.0
      10.0500
      10.32
      9.960
      ...
      10.4200
      10.3400
      10.3800
      25295
      15.3
      10.4300
      10.430
      10.28
      10.40
      30946
    
    
      3
      9.3
      8.60
      8.60
      8.60
      8.60
      10252
      10.0
      9.0115
      9.10
      8.400
      ...
      9.0300
      8.9300
      8.9500
      47051
      15.3
      8.8343
      9.015
      8.80
      9.00
      22472
    
    
      4
      9.3
      6.03
      6.03
      6.03
      6.03
      476
      10.0
      6.0600
      6.28
      6.035
      ...
      6.1200
      6.0700
      6.1200
      1100
      15.3
      6.0300
      6.068
      6.03
      6.05
      1200
    
  

5 rows × 78 columns



In [739]:

    
# Show the held-out target values (one per test row) as a table.
pd.DataFrame(labels_test)



In [740]:

    
# Show the most recently computed predictions for the test rows as a table
# (pred is whatever the last executed model cell assigned).
pd.DataFrame(pred)

	0
0	6.4300
1	9.9499
2	10.4000
3	9.3400
4	6.1300

	0
0	6.75
1	10.45
2	10.29
3	10.45
4	6.27

	TIMEINDEX	CLOSE	HIGH	LOW	OPEN	VOLUME	DATE
0	9.3	6.1100	6.11	6.1100	6.11	250	2015-04-10
1	10.0	6.1200	6.13	6.1100	6.12	1400	2015-04-10
2	10.3	6.0999	6.12	6.0901	6.12	5087	2015-04-10
3	11.0	6.1000	6.12	6.1000	6.10	2640	2015-04-10
4	11.3	6.1200	6.13	6.1200	6.13	300	2015-04-10

	TIMEINDEX	CLOSE	HIGH	LOW	OPEN	VOLUME	DATE
0	16	6.19	6.2300	6.0900	6.11	46441	2015-04-10
1	16	6.15	6.4700	6.1410	6.22	80132	2015-04-13
2	16	6.21	6.2876	6.1700	6.18	28147	2015-04-14
3	16	6.25	6.4000	6.1800	6.18	30968	2015-04-15
4	16	6.23	6.3000	6.1106	6.29	69650	2015-04-16

	0	1	2	3	4	5	6	7	8	9	...	68	69	70	71	72	73	74	75	76	77
0	10.0	6.27	6.31	6.25	6.31	1300	10.3	6.3000	6.30	6.200	...	6.2363	6.2200	6.2363	700	16.0	6.2000	6.250	6.17	6.25	2220
1	9.3	8.90	8.90	8.90	8.90	13570	10.0	9.0000	9.34	8.680	...	9.2370	9.1701	9.2000	15979	15.3	9.2000	9.240	9.15	9.17	35150
2	9.3	9.97	9.97	9.97	9.97	5792	10.0	10.0500	10.32	9.960	...	10.4200	10.3400	10.3800	25295	15.3	10.4300	10.430	10.28	10.40	30946
3	9.3	8.60	8.60	8.60	8.60	10252	10.0	9.0115	9.10	8.400	...	9.0300	8.9300	8.9500	47051	15.3	8.8343	9.015	8.80	9.00	22472
4	9.3	6.03	6.03	6.03	6.03	476	10.0	6.0600	6.28	6.035	...	6.1200	6.0700	6.1200	1100	15.3	6.0300	6.068	6.03	6.05	1200