In [3]:
import requests
import csv
import re
import numpy as np
import pandas as pd
from scipy import stats
import datetime
import Quandl
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import RandomizedPCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
In [4]:
def get_stock_data(ticker, seconds_interval, num_of_days):
    url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)
    # get data and convert to a data frame (skip the header rows, keeping only the column line)
    stock_df = pd.read_csv(url, skiprows=[0,1,2,3,5,6])
    # rename column
    stock_df.rename(columns={'COLUMNS=DATE':'time'}, inplace=True)
    # strip the leading 'a' that marks full unix timestamps
    stock_df.replace(to_replace={'time':{'a':''}}, regex=True, inplace=True)
    # get the entire column and convert to ints
    time_indices = [int(x) for x in stock_df['time'].values]
    # keep track of the current full timestamp
    last_timestamp = time_indices[0]
    # expand interval offsets into full unix timestamps
    for i in range(len(time_indices)):
        if time_indices[i] < last_timestamp:
            time_indices[i] = last_timestamp + (time_indices[i] * seconds_interval)
        else:
            last_timestamp = time_indices[i]
    # convert unix timestamps to human-readable strings
    time_indices = [datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S') for x in time_indices]
    # keep the times (i.e., not the dates), encoded as HH.MM floats (e.g., '09:30' -> 9.3)
    times = [float(x[-8:-3].replace(':','.')) for x in time_indices]
    # overwrite the time column
    stock_df['time'] = times
    # keep day of month
    #dates = [int(x[:10].split('-')[2]) for x in time_indices]
    # create new column in data frame
    #stock_df['month_date'] = dates
    # get weekday as int value
    #stock_df['week_day'] = [datetime.datetime.strptime(x[:10], '%Y-%m-%d').weekday() for x in time_indices]
    # create features
    stock_df['op_cl%'] = np.true_divide((stock_df['CLOSE'] - stock_df['OPEN']), stock_df['CLOSE'])
    stock_df['lo_hi%'] = np.true_divide((stock_df['HIGH'] - stock_df['LOW']), stock_df['HIGH'])
    stock_df['vol_norm'] = np.true_divide(stock_df['VOLUME'], np.max(stock_df['VOLUME']))
    # create labels dataframe (keeps the raw price columns)
    labels_df = stock_df.copy(deep=True)
    # remove the raw price columns from the features
    stock_df = stock_df.drop(['CLOSE', 'OPEN', 'LOW', 'HIGH', 'VOLUME'], axis=1)
    #print stock_df.shape
    #stock_df.head()
    return stock_df, labels_df
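For reference, a minimal sketch (with made-up values) of the timestamp expansion above: in the getprices format, the first row of a day carries a full unix timestamp (prefixed with 'a') and subsequent rows carry interval offsets from it, so offsets are expanded back into full timestamps.
seconds_interval = 1800
raw = [1434029400, 1, 2, 3]  # made-up: one full timestamp, then interval offsets
last_timestamp = raw[0]
for i in range(len(raw)):
    if raw[i] < last_timestamp:
        raw[i] = last_timestamp + (raw[i] * seconds_interval)
    else:
        last_timestamp = raw[i]
print raw  # [1434029400, 1434031200, 1434033000, 1434034800]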
In [5]:
tickers = ['ARNA',
           'ATNM',
           'AVXL',
           'AXN',
           'BLFS',
           'BOTA',
           'CBLI',
           'CPRX',
           'DARA',
           'ECYT',
           'EDAP',
           'EXAS',
           'HALO',
           'IDRA',
           'INO',
           'LOXO',
           'LPCN',
           'MEIP',
           'MNKD',
           'OREX',
           'PGNX',
           'QLTI',
           'RMTI',
           'SGYP',
           'SNGX',
           'SPY',
           'SYN',
           'TENX',
           'THLD',
           'TNXP',
           'TPIV']
In [6]:
# download data
for ticker in tickers:
    seconds_interval = 1800  # 1800: 30-minute bars (seems the most consistent)
    stock_df, labels_df = get_stock_data(ticker, seconds_interval, 1000)
    stock_df.to_csv("goog_data/{}_features.csv".format(ticker), index=False)
    labels_df.to_csv("goog_data/{}_labels.csv".format(ticker), index=False)
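Since this loop hits the remote endpoint for every ticker on every run, here is a minimal sketch of skipping tickers whose files already exist on disk (the caching behavior is an assumption, not something the original notebook does):
import os.path
for ticker in tickers:
    path = "goog_data/{}_features.csv".format(ticker)
    if os.path.exists(path):
        continue  # already downloaded; skip the network call
    stock_df, labels_df = get_stock_data(ticker, 1800, 1000)
    stock_df.to_csv(path, index=False)
    labels_df.to_csv("goog_data/{}_labels.csv".format(ticker), index=False)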
In [442]:
# import data
X = []
y = []
for ticker in tickers:
    stock_df = pd.read_csv("goog_data/{}_features.csv".format(ticker))
    labels_df = pd.read_csv("goog_data/{}_labels.csv".format(ticker))
    num_of_times = stock_df['time'].unique().shape[0]
    stock_df = stock_df.drop('time', axis=1)
    assert num_of_times == 14, "wrong number of times"
    #print "number of times in a day: {}".format(num_of_times)
    num_of_days = stock_df.shape[0]/num_of_times
    #print "number of days: {}".format(num_of_days)
    for i in xrange(num_of_days):
        # features
        features = stock_df.values[:num_of_times].flatten()
        features = np.expand_dims(features, axis=0)
        #assert features.shape[1] == 84, "wrong number of columns"
        # combine features into rows of X
        if len(X) == 0:
            X = features
        else:
            X = np.vstack((X, features))
        # labels
        labels = labels_df.values[:num_of_times].flatten()
        # (last bar's close - first bar's close) / last bar's close
        label = np.true_divide((labels[-8] - labels[1]), labels[-8])
        # make class
        label = int(label*100)
        # make binary class
        def binarize(label):
            if label >= 5:
                label = 1
            else:
                label = 0
            return label
        label = binarize(label)
        if len(y) == 0:
            y = np.array([label])
        else:
            y = np.append(y, np.array(label))
        # remove used rows (go to next day)
        stock_df = stock_df[num_of_times:]
        labels_df = labels_df[num_of_times:]
# shift labels by one day so today's features predict tomorrow's label
# (note: at ticker boundaries this pairs one ticker's last day with the next ticker's first label)
X = X[:-1]
y = y[1:]
print "\n", "*"*10, "\nfinal shapes: ", X.shape, y.shape
In [443]:
plt.hist(y, bins=20, alpha=0.7, color='r')
plt.show()
In [444]:
y_values = np.unique(y, return_counts=True)[0]
print y_values
In [445]:
class_counts = np.unique(y, return_counts=True)[1]
print class_counts
In [446]:
classes_to_remove = []
for i in np.where(class_counts == 1)[0]:
    classes_to_remove.append(y_values[i])
print classes_to_remove
In [447]:
# single-class removal: drop rows whose class occurs only once
rows_to_remove = []
for i in xrange(len(y)):
    if y[i] in classes_to_remove:
        rows_to_remove.append(i)
X = np.delete(X, rows_to_remove, axis=0)
y = np.delete(y, rows_to_remove)
print X.shape, y.shape
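An equivalent, more idiomatic formulation using a boolean mask (a sketch built on the same classes_to_remove list):
mask = ~np.in1d(y, classes_to_remove)  # True for rows whose class is kept
X, y = X[mask], y[mask]
print X.shape, y.shape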
In [448]:
# outlier-label removal
'''
rows_to_remove = []
for i in xrange(len(y)):
    if y[i] < -10 or y[i] > 10:
        rows_to_remove.append(i)
X = np.delete(X, rows_to_remove, axis=0)
y = np.delete(y, rows_to_remove)
print X.shape, y.shape
'''
Out[448]:
In [449]:
print "number of labels: ", np.unique(y, return_counts=True)[0].shape[0]
In [450]:
corr_df = pd.DataFrame(X)
corr_df['label'] = y
np.max(corr_df.corr()['label'].values[:-1])
Out[450]:
In [451]:
#for i in xrange(X.shape[1]):
#    plt.scatter(X[:,i], y)
#    plt.show()
In [452]:
# univariate feature selection: count features whose score clears a threshold
threshold = 10
if np.unique(y, return_counts=True)[0].shape[0] > 2:
    threshold = 3  # lower the bar when the labels are multiclass
skb = SelectKBest()
skb = skb.fit(X,y)
counter = 0
for i in xrange(skb.scores_.shape[0]):
    if skb.scores_[i] > threshold:
        counter += 1
        print i, skb.scores_[i]
print "\nfeatures scoring higher than {}: ".format(threshold), counter
In [453]:
skb = SelectKBest(k=counter)
skb = skb.fit(X,y)
X = skb.transform(X)
X.shape
Out[453]:
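To see which of the original columns survived, and as a reminder that the same fitted selector must be applied to any future data before calling predict, a small sketch using the standard get_support method:
kept = skb.get_support(indices=True)  # column indices retained by SelectKBest
print kept
# any new data must go through the same transform, e.g. new_X = skb.transform(new_X)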
In [ ]:
#clf = GaussianNB() # provided as a starting point; try a variety of classifiers
#clf = DecisionTreeClassifier()
clf = SVC()
#pipeline = make_pipeline(clf)
pipeline = make_pipeline(StandardScaler(), RandomizedPCA(), clf)
# cross validation
cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=42)
# tune parameters
params = dict()
# for PCA
params['randomizedpca__iterated_power'] = [1, 2, 3]
params['randomizedpca__n_components'] = [2, 4, 6, 8, 10]
params['randomizedpca__random_state'] = [42]
params['randomizedpca__whiten'] = [True, False]
if str(clf)[0] == 'D':
    params['decisiontreeclassifier__criterion'] = ['gini', 'entropy']
    params['decisiontreeclassifier__max_features'] = ['auto', 'sqrt', 'log2', None]
    params['decisiontreeclassifier__class_weight'] = ['auto', None]
    params['decisiontreeclassifier__random_state'] = [42]
if str(clf)[0] == 'S':
    # [2**x for x in np.arange(-15, 15+1, 3)]
    params['svc__C'] = [10, 100, 1000]
    params['svc__gamma'] = [0.1, 0.2, 0.5, 1.0]
    params['svc__random_state'] = [42]
grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)
grid_search.fit(X, y)
clf = grid_search.best_estimator_
print clf
print "\nscore: ", grid_search.best_score_
clf = clf.fit(X,y)
y_pred = clf.predict(X)
# note: these are training-set predictions, so the report below is optimistic
print "\n", classification_report(y, y_pred)
In [455]:
### labels as binary
# GaussianNB
## default (pipeline, stratifiedshufflesplit, gridsearch): 0.831379310345
## skb (k=11): 0.853448275862
## standardscaler: 0.853448275862
## default pca: 0.848965517241
## pca gridsearched: 0.880344827586
###-> iterated_power=1, n_components=2, whiten=True
## defaults for following classifiers:
###-> skb (k=11), pipeline [standardscaler, pca], stratifiedshufflesplit, gridsearch
# DecisionTreeClassifier
## default: 0.845517241379
## params gridsearched: 0.849655172414
###-> class_weight='auto', criterion='gini', max_features=None
# SVC
## default: 0.910689655172
## params gridsearched: took too long to run
In [456]:
### labels as ints, quasi-normally distributed
# GaussianNB
## default (pipeline, stratifiedshufflesplit, gridsearch): 0.0996515679443
## skb (k=10): 0.249128919861
## standardscaler: 0.249128919861
## default pca: 0.256445993031
## pca gridsearched: 0.27456445993 #-> iterated_power=1, n_components=2, whiten=True
## defaults for following classifiers:
###-> skb (k=10), pipeline [standardscaler, pca], stratifiedshufflesplit, gridsearch
# DecisionTreeClassifier
## default: 0.150174216028
## params gridsearched: 0.15331010453
###-> class_weight=None, criterion='gini', max_features='auto'
# SVC
## default: 0.286411149826
## params gridsearched: took too long to run
In [457]:
#for i in xrange(X_train.shape[1]):
#    plt.scatter(X_train[:,i], y_train)
#    plt.show()
In [458]:
'''
num_rows = X_train.shape[0]
num_cols = X_train.shape[1]
rows_to_remove = set()
for i in xrange(num_cols):
    low = np.percentile(X_train[:,i], 5)
    high = np.percentile(X_train[:,i], 95)
    for j in xrange(num_rows):
        if X_train[j,i] == low or X_train[j,i] == high:
            rows_to_remove.add(j)
X_train = np.delete(X_train, list(rows_to_remove), axis=0)
y_train = np.delete(y_train, list(rows_to_remove))
print X_train.shape, y_train.shape
'''
Out[458]:
In [459]:
'''
regr_or_clf = "clf"
if regr_or_clf == 'regr':
    #poly = PolynomialFeatures(degree=3)
    #poly.fit(X_train,y_train)
    #X_train = poly.transform(X_train)
    #X_test = poly.transform(X_test)
    high_score = -9999999
    best_y_pred = []
    for regr in [LinearRegression(), Ridge(), Lasso(), BayesianRidge(), SGDRegressor(), SVR(), DecisionTreeRegressor(), GradientBoostingRegressor()]:
        regr = regr.fit(X_train, y_train)
        score = regr.score(X_test, y_test)
        print score
        if score > high_score:
            high_score = score
            best_y_pred = regr.predict(X_test)
elif regr_or_clf == 'clf':
    best_f1_score = -9999999
    best_clf = []
    best_y_pred = []
    mean_y_pred = []
    for clf in [GaussianNB(), DecisionTreeClassifier(random_state=42), RandomForestClassifier(), SVC(C=100, kernel='rbf', gamma=10, random_state=42)]:
    #for clf in [SVC(C=1000, kernel='rbf', gamma=1, random_state=42)]:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        clf_report = classification_report(y_test, y_pred)
        clf_report = [re.sub(r"[a-z]|\n", '', x) for x in clf_report.split(" ")]
        clf_report = filter(None, clf_report)
        f1_score = float(clf_report[-2])
        print f1_score
        if f1_score > best_f1_score:
            best_f1_score = f1_score
            best_clf = clf
            best_y_pred = y_pred
        #print "\nconfusion matrix:\n", confusion_matrix(y_test, y_pred)
        #print "\nclassification report:\n", classification_report(y_test, y_pred)
    print str(best_clf)[:3]
    print classification_report(y_test, best_y_pred)
'''
Out[459]:
In [460]:
#X_df = pd.DataFrame(X)
#X_df['labels'] = y
#sns.pairplot(X_df, hue='labels')
#plt.show()
In [461]:
plt.hist(y, color='b', alpha=0.7)
plt.hist(y_pred, color='y', alpha=0.7)
plt.show()
In [462]:
plt.scatter(np.arange(y.shape[0]), y, color='b', alpha=0.7)
plt.scatter(np.arange(y_pred.shape[0]), y_pred, color='y', alpha=0.7)
plt.show()
In [463]:
y_pred - y
Out[463]:
In [464]:
np.sum(y)
Out[464]:
In [465]:
error_count = 0
for i in xrange(len(y)):
    if y_pred[i] != y[i]:
        error_count += 1
print error_count, " / ", len(y)
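The same error count in one vectorized line (relies only on numpy broadcasting):
print np.sum(y_pred != y), " / ", len(y)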
In [466]:
# import data
new_X = []
for ticker in tickers:
    stock_df = pd.read_csv("goog_data/{}_features.csv".format(ticker))
    labels_df = pd.read_csv("goog_data/{}_labels.csv".format(ticker))
    num_of_times = stock_df['time'].unique().shape[0]
    stock_df = stock_df.drop('time', axis=1)
    assert num_of_times == 14, "wrong number of times"
    #print "number of times in a day: {}".format(num_of_times)
    # features from the most recent day
    features = stock_df.values[(-1*num_of_times):].flatten()
    features = np.expand_dims(features, axis=0)
    # combine features into rows of X
    if len(new_X) == 0:
        new_X = features
    else:
        new_X = np.vstack((new_X, features))
print "\n", "*"*10, "\nfinal shape: ", new_X.shape
In [467]:
# apply the same fitted feature selection before predicting
# (new_X must match the columns the pipeline was trained on)
new_X = skb.transform(new_X)
new_pred = clf.predict(new_X)
for i in xrange(len(tickers)):
    print i, tickers[i], new_pred[i]
'''
0 ARNA 0
1 ATNM 0
2 AVXL 0
3 AXN 0
4 BLFS 0
5 BOTA 1
6 CBLI 0
7 CPRX 0
8 DARA 0
9 ECYT 0
10 EDAP 0
11 EXAS 0
12 HALO 0
13 IDRA 0
14 INO 0
15 LOXO 0
16 LPCN 0
17 MEIP 0
18 MNKD 0
19 OREX 0
20 PGNX 1
21 QLTI 0
22 RMTI 0
23 SGYP 0
24 SNGX 0
25 SPY 0
26 SYN 1
27 TENX 0
28 THLD 0
29 TNXP 0
30 TPIV 1
'''