In [1]:
import requests
import csv
import re
import numpy as np
import pandas as pd
from scipy import stats
import datetime
from time import time
import Quandl
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
%matplotlib inline
import tickers
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import RandomizedPCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
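Several of these imports are tied to the library versions this notebook was written against (Python 2, scikit-learn < 0.18, pandas < 0.18). On current releases the same functionality lives under different names; a rough mapping, for reference only:
# modern equivalents (not what this notebook actually ran against):
# from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
# from sklearn.decomposition import PCA    # PCA(svd_solver='randomized') replaces RandomizedPCA
# import quandl                            # lower-case package; quandl.get(...) replaces Quandl.get(...)
# df['Close'].rolling(window=50).mean()    # replaces pd.rolling_mean(df['Close'], window=50)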
In [2]:
download = False
print len(tickers.tickers), "tickers"
if download:
    # download data
    for ticker in tickers.tickers:
        stock_df = Quandl.get("YAHOO/{}".format(ticker), authtoken='DVhizWXNTePyzzy1eHWR')
        stock_df.to_csv("quandl_data/{}.csv".format(ticker), index=False)
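The download loop as written aborts on the first ticker whose Quandl dataset is missing or fails to fetch. A minimal guard, sketched with a broad except because the exact exception class depends on the Quandl package version:
if download:
    for ticker in tickers.tickers:
        try:
            stock_df = Quandl.get("YAHOO/{}".format(ticker), authtoken='DVhizWXNTePyzzy1eHWR')
        except Exception as e:
            # skip this ticker instead of aborting the whole loop
            print "skipping", ticker, "-", e
            continue
        stock_df.to_csv("quandl_data/{}.csv".format(ticker), index=False)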
In [3]:
def modify_columns(ticker, normalize):
    df = pd.read_csv("quandl_data/{}.csv".format(ticker))
    df = df.drop('Adjusted Close', axis=1)
    # 50- and 200-day moving averages of the close
    df['50dravg'] = pd.rolling_mean(df['Close'], window=50)
    df['200dravg'] = pd.rolling_mean(df['Close'], window=200)
    if normalize:
        # z-score each price column against the row-wise mean/std of the
        # price columns; volume is set aside so it doesn't skew the stats
        temp_df = df['Volume']
        df = df.drop('Volume', axis=1)
        std_df = df.std(axis=1, ddof=0)
        df['mean'] = df.mean(axis=1)
        df['std'] = std_df
        df['Open'] = (df['Open'] - df['mean']) / df['std']
        df['High'] = (df['High'] - df['mean']) / df['std']
        df['Low'] = (df['Low'] - df['mean']) / df['std']
        df['Close'] = (df['Close'] - df['mean']) / df['std']
        df['50dravg'] = (df['50dravg'] - df['mean']) / df['std']
        df['200dravg'] = (df['200dravg'] - df['mean']) / df['std']
        df = df.drop(['mean', 'std'], axis=1)
        df['Volume'] = temp_df
    df['OC%'] = (df['Close'] / df['Open']) - 1
    df['HL%'] = (df['High'] / df['Low']) - 1
    # label: the next day's open-to-close return
    df['label'] = df['OC%'].shift(-1)
    return df  #df.loc[1000:]
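A quick sanity check on the normalize branch: each day's price columns are z-scored against that row's own mean and population standard deviation (ddof=0), so only the intraday shape survives, not the price level. A minimal sketch with made-up numbers, using just the OHLC columns for brevity (the real function also includes the two moving averages in the row statistics):
row = pd.Series({'Open': 10.0, 'High': 11.0, 'Low': 9.0, 'Close': 10.5})
z = (row - row.mean()) / row.std(ddof=0)   # mean 10.125, std ~0.74
print z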
In [4]:
normalize = False
scale_volume = False
binarize = True
# import data: stack every ticker's frame into one DataFrame
stock_df = pd.DataFrame()
for ticker in tickers.tickers:
    if stock_df.empty:
        stock_df = modify_columns(ticker, normalize)
    else:
        stock_df = stock_df.append(modify_columns(ticker, normalize))
        #stock_df = pd.concat([stock_df, modify_columns(ticker, normalize)])
        #stock_df = pd.concat([stock_df, modify_columns(ticker, normalize)], verify_integrity=True)
# scale volume to [0, 1]
if scale_volume:
    stock_df['Volume'] = (stock_df['Volume'] - stock_df['Volume'].min()) / (stock_df['Volume'].max() - stock_df['Volume'].min())
# log volume
#stock_df['Volume'] = stock_df['Volume'].map(lambda x: np.log(x))
#stock_df = stock_df.drop(['Open', 'High', 'Low', 'Close'], axis=1)
stock_df = stock_df.replace([np.inf, -np.inf], np.nan)
prediction_df = stock_df.copy()
stock_df = stock_df.dropna()
# binarize labels: 1 if next-day open-to-close return is at least 5%
if binarize:
    stock_df['label'] = stock_df['label'].map(lambda x: 1 if x >= 0.05 else 0)
print stock_df.shape
stock_df.tail()
Out[4]:
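For reference, label is the next day's open-to-close return (OC% shifted back one row), so with binarize on, the task becomes: will the stock gain at least 5% from open to close tomorrow?
# e.g. tomorrow's Open=10.00, Close=10.60 -> OC% = 0.06 -> label 1 (>= 0.05)
# e.g. tomorrow's Open=10.00, Close=10.20 -> OC% = 0.02 -> label 0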
In [5]:
stock_df.describe()
Out[5]:
In [6]:
stock_df.Volume.hist(bins=50)
Out[6]:
In [7]:
# correlation of each column with the label
corr_with_label = stock_df.corr()['label']
for i in xrange(len(stock_df.columns)):
    print i, stock_df.columns[i], corr_with_label.values[i]
In [8]:
negative_df = stock_df[stock_df['label'] == 0]
positive_df = stock_df[stock_df['label'] == 1]
plt.scatter(negative_df['200dravg'], negative_df['Volume'])
plt.scatter(positive_df['200dravg'], positive_df['Volume'], color='r')
plt.show()
In [9]:
x, y, z = '200dravg', '50dravg', 'Close'
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(negative_df[x], negative_df[y], negative_df[z], alpha=0.1, color='y')
ax.scatter(positive_df[x], positive_df[y], positive_df[z], color='r')
ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_zlabel(z)
ax.view_init(azim=260)
plt.show()
In [10]:
y = stock_df['label'].values
stock_df = stock_df.drop('label', axis=1)
X = stock_df.values
print X.shape, y.shape
In [11]:
plt.hist(y, bins=50, alpha=0.7, color='r')
plt.show()
In [12]:
y_values = np.unique(y, return_counts=True)[0]
print y_values.shape, "\n"
print y_values
In [13]:
class_counts = np.unique(y, return_counts=True)[1]
print class_counts
print "percent 1: ", np.true_divide(class_counts[1], np.sum(class_counts))
In [14]:
# collect label values that occur only once; StratifiedShuffleSplit needs
# at least two samples per class, so these rows are dropped in the
# regression branch of In [18]
classes_to_remove = []
for i in np.where(class_counts == 1)[0]:
    classes_to_remove.append(y_values[i])
print len(classes_to_remove)
print classes_to_remove[:5]
print classes_to_remove[-5:]
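The regression branch of In [18] drops these rows with an explicit Python loop. An equivalent vectorized filter (np.in1d) would be:
# vectorized version of the row-removal loop in In [18]
mask = np.in1d(y, classes_to_remove)
# new_X, y = new_X[~mask], y[~mask]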
In [15]:
print "number of labels: ", np.unique(y, return_counts=True)[0].shape[0]
In [16]:
#for i in xrange(X.shape[1]):
#    plt.scatter(X[:,i], y)
#    plt.show()
In [17]:
#for i in xrange(X.shape[1]):
#    plt.hist(X[:,i])
#    plt.show()
In [18]:
skb = []
best_learner = []
#for k in xrange(1,X.shape[1]+1):
for k in xrange(8, 9):
    print "\n", "#"*50, "\n"
    # univariate feature selection: keep only the k highest-scoring features
    skb = SelectKBest(k=k)
    skb = skb.fit(X, y)
    new_X = skb.transform(X)
    t0 = time()
    clf_or_regr = "clf"
    pca = False
    ############################################################################
    if clf_or_regr == "regr":
        # add bias
        new_X = np.hstack((new_X, np.ones((new_X.shape[0], 1))))
        # single-class removal: StratifiedShuffleSplit needs >= 2 samples per class
        rows_to_remove = []
        for i in xrange(len(y)):
            if y[i] in classes_to_remove:
                rows_to_remove.append(i)
        new_X = np.delete(new_X, rows_to_remove, axis=0)
        y = np.delete(y, rows_to_remove)
    ############################################################################
    learner = []
    pipeline = []
    ############################################################################
    if clf_or_regr == "clf":
        #learner = GaussianNB()
        learner = DecisionTreeClassifier()
        #learner = RandomForestClassifier()
        #learner = SVC()
        #learner = LogisticRegression()
    if clf_or_regr == "regr":
        learner = LinearRegression()
        #learner = Ridge()
        #learner = Lasso()
        #learner = BayesianRidge()
        #learner = SGDRegressor()
        #learner = SVR()
        #learner = DecisionTreeRegressor()
        #learner = GradientBoostingRegressor()
    ############################################################################
    if not pca:
        pipeline = make_pipeline(learner)
    else:
        pipeline = make_pipeline(RandomizedPCA(), learner)
    # cross validation
    cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=42)
    # tune parameters
    params = dict()
    # for PCA
    if pca:
        #params['randomizedpca__iterated_power'] = [1, 2, 3]
        #params['randomizedpca__n_components'] = [2, 4, 6, 8, 10]
        params['randomizedpca__random_state'] = [42]
        #params['randomizedpca__whiten'] = [True, False]
    ############################################################################
    if clf_or_regr == "clf":
        if isinstance(learner, DecisionTreeClassifier):
            params['decisiontreeclassifier__criterion'] = ['gini', 'entropy']
            params['decisiontreeclassifier__max_features'] = ['auto', 'sqrt', 'log2', None]
            params['decisiontreeclassifier__class_weight'] = ['auto', None]
            params['decisiontreeclassifier__random_state'] = [42]
        if isinstance(learner, SVC):
            # [2**x for x in np.arange(-15, 15+1, 3)]
            params['svc__C'] = np.logspace(-2, 10, 13)
            params['svc__gamma'] = np.logspace(-9, 3, 13)
            params['svc__random_state'] = [42]
    ############################################################################
    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)
    grid_search.fit(new_X, y)
    best_learner = grid_search.best_estimator_
    print best_learner
    print "\nscore: ", grid_search.best_score_
    best_learner = best_learner.fit(new_X, y)
    y_pred = best_learner.predict(new_X)
    ############################################################################
    if clf_or_regr == "clf":
        cm = confusion_matrix(y, y_pred)
        print "\nconfusion matrix:\n", "      FALSE TRUE\n", "FALSE", cm[0], "\nTRUE ", cm[1]
        print "\n", classification_report(y, y_pred)
    print "\nprice data normalized:", normalize
    print "\nvolume scaled:", scale_volume
    print "\nbinarized:", binarize
    print "\nk =", k
    print "\nPCA:", pca
    print "\nminutes for learner to run:", round((time()-t0)/60.0, 3)
    print
    if k == 1:
        # with k=1, inspect the SelectKBest score of every column
        for i in xrange(len(stock_df.columns)):
            print stock_df.columns[i], np.round(skb.scores_[i], 2)
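One caveat on the printout above: grid_search.best_score_ is cross-validated, but the confusion matrix and classification report are computed on the same rows the final model was refit on, so they are optimistic. A held-out check is cheap since train_test_split is already imported; clone keeps best_learner itself untouched for the prediction cells below:
from sklearn.base import clone
X_tr, X_te, y_tr, y_te = train_test_split(new_X, y, test_size=0.2, random_state=42)
held_out = clone(grid_search.best_estimator_).fit(X_tr, y_tr)
print classification_report(y_te, held_out.predict(X_te))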
In [19]:
#X_df = pd.DataFrame(X[:,:4])
#X_df = pd.DataFrame(X)
#X_df['labels'] = y
#sns.pairplot(X_df, hue='labels')
#plt.show()
In [20]:
#plt.hist(y, color='b', alpha=0.7)
#plt.hist(y_pred, color='y', alpha=0.7)
#plt.show()
In [21]:
#plt.scatter(np.arange(y.shape[0]), y, color='b', alpha=0.7)
#plt.scatter(np.arange(y_pred.shape[0]), y_pred, color='y', alpha=0.7)
#plt.show()
In [22]:
#y_pred - y
In [23]:
#np.sum(y)
In [24]:
#error_count = 0
#for i in xrange(len(y)):
#    if y_pred[i] != y[i]:
#        error_count += 1
#
#print error_count, " / ", len(y)
In [25]:
prediction_df[prediction_df['label'].isnull()].shape
Out[25]:
In [26]:
# rows with a NaN label (the last day on file for each ticker, since the
# label is the next day's return) are the ones we want predictions for
prediction_X = prediction_df[prediction_df['label'].isnull()].drop('label', axis=1).values
print prediction_X.shape, "\n"
prediction_X = skb.transform(prediction_X)
y_pred = best_learner.predict(prediction_X)
for i in xrange(len(tickers.tickers)):
    print i, tickers.tickers[i], y_pred[i]
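The loop assumes the NaN-label rows come out one per ticker, in the same order as tickers.tickers, which holds as long as each per-ticker frame is appended in order and contributes exactly one unlabeled final row. A cheap assertion makes that assumption explicit:
# fails loudly if the row-to-ticker alignment assumption is violated
assert prediction_X.shape[0] == len(tickers.tickers)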
In [27]:
'''
new_X = stock_df.values
new_pred = learner.predict(new_X)
for i in xrange(len(tickers)):
print "[{}]".format(i), tickers[i], new_pred[i]
'''
Out[27]:
In [28]:
# http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html
# http://scikit-learn.org/stable/auto_examples/semi_supervised/plot_label_propagation_versus_svm_iris.html