In [1]:
import urllib3
import json
import datetime
import time
import threading
import timeit
import os
import pandas as pd
import pickle as pk
import numpy as np
import scipy as sp
from time import strftime, strptime, gmtime

# Silence library warnings by replacing warnings.warn with a no-op
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# Visualization libraries
# Displays plots in output cells
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
from sklearn.linear_model import PassiveAggressiveClassifier,PassiveAggressiveRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
In [3]:
def positive_negative(v):
    #map a return to a classification label: 1 if non-negative, 0 otherwise
    if v >= 0:
        return 1
    else:
        return 0
In [11]:
def html_downloader(symbol, days=60, days_ago=0):
    #date variables
    dt = datetime.datetime.now()
    UnixTime = int(time.mktime(dt.timetuple()))
    #web variables
    url = ('https://query1.finance.yahoo.com/v8/finance/chart/' + symbol +
           '?period1=' + str(UnixTime - 86400 * (days + days_ago)) +
           '&period2=' + str(UnixTime - 86400 * days_ago) +
           '&interval=1d&indicators=quote%7Csma~50&includePrePost=true' +
           '&events=div%7Csplit%7Cearn&lang=en-CA&region=CA&corsDomain=ca.finance.yahoo.com')
    hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
           'Connection': 'keep-alive'}
    #json data request
    try:
        http = urllib3.PoolManager()
        request = http.request('GET', url, headers=hdr)
        data = json.loads(request.data.decode('utf-8'))
        return data
    except Exception as e:
        print(e)
        return None
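A minimal usage sketch for the downloader above, assuming the Yahoo Finance v8 chart endpoint is reachable; the symbol 'AAPL' and the printed key names are illustrative, based on the fields the later cells unpack.
In [ ]:
#illustrative check of the JSON layout returned by html_downloader (symbol is an assumption)
sample = html_downloader('AAPL', days=60)
if sample is not None:
    result = sample['chart']['result'][0]
    print(result.keys())                               #expect 'meta', 'timestamp', 'indicators'
    print(result['indicators']['quote'][0].keys())     #expect 'open', 'high', 'low', 'close', 'volume'
    print(len(result['indicators']['sma'][0]['sma']))  #the 50-day SMA series used below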
In [13]:
def learning_downloader(symbol, days=365, shift_days=10):
    data = html_downloader(symbol, days=days)
    try:
        #json unpack
        timestamp = data['chart']['result'][0]['timestamp']
        timestamp = [datetime.datetime.fromtimestamp(x).strftime('%Y%m%d') for x in timestamp]
        quote = data['chart']['result'][0]['indicators']['quote'][0]
        sma50 = data['chart']['result'][0]['indicators']['sma'][0]['sma']
        stock_df = pd.DataFrame(quote)
        #index is symbol and timestamp
        stock_df.index = [symbol + str(x) for x in timestamp]
        #moving averages
        stock_df['sma'] = sma50
        stock_df['vol20'] = stock_df['volume'].rolling(window=20).mean()
        #predict on yesterday's averages
        stock_df['sma'] = stock_df['sma'].shift(1)
        stock_df['vol20'] = stock_df['vol20'].shift(1)
        stock_df = stock_df.dropna()
        #derived columns: prices normalized by the 50-day SMA, volume by the 20-day average
        stock_df['c_0'] = stock_df['close'] / stock_df['sma']
        stock_df['h_0'] = stock_df['high'] / stock_df['sma']
        stock_df['l_0'] = stock_df['low'] / stock_df['sma']
        stock_df['o_0'] = stock_df['open'] / stock_df['sma']
        stock_df['v_0'] = stock_df['volume'] / stock_df['vol20']
        #target: intraday return (close relative to open)
        stock_df['r'] = (stock_df['close'] - stock_df['open']) / stock_df['open']
        #lagged columns for the previous shift_days sessions
        for d in range(1, shift_days + 1):
            stock_df['c_' + str(d)] = stock_df['c_0'].shift(d)
            stock_df['h_' + str(d)] = stock_df['h_0'].shift(d)
            stock_df['l_' + str(d)] = stock_df['l_0'].shift(d)
            stock_df['o_' + str(d)] = stock_df['o_0'].shift(d)
            stock_df['v_' + str(d)] = stock_df['v_0'].shift(d)
        stock_df.drop(['close', 'high', 'low', 'open', 'volume', 'sma', 'vol20'], axis=1, inplace=True)
        #do not use current day's intraday data
        #stock_df = stock_df.drop(stock_df.index[len(stock_df)-1])
        #clean data
        stock_df = stock_df.dropna()
        stock_df = stock_df[~(stock_df == np.inf).any(axis=1)]
        if len(stock_df) != 0:
            #ensure the output directory exists before saving
            os.makedirs('./data_s', exist_ok=True)
            with open('./data_s/' + symbol + '.p', 'wb') as f:
                pk.dump(stock_df, f)
            print(stock_df)
    except Exception:
        pass
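A quick sanity check of the saved learning frame; the symbol and path are illustrative and assume the download above succeeded and wrote ./data_s/AAPL.p.
In [ ]:
#illustrative check: the saved frame should contain the target 'r' plus
#c/h/l/o/v columns for lags 0..shift_days (symbol and path are assumptions)
learning_downloader('AAPL', days=365, shift_days=10)
sample_df = pd.read_pickle('./data_s/AAPL.p')
print(sample_df.columns.tolist())
print(sample_df.tail())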
In [29]:
def stock_predict(symbol, clf, clfR, shift_days=10, days_ago=0):
    try:
        data = html_downloader(symbol, days_ago=days_ago)
        #json unpack
        timestamp = data['chart']['result'][0]['timestamp']
        timestamp = [datetime.datetime.fromtimestamp(x).strftime('%Y%m%d') for x in timestamp]
        quote = data['chart']['result'][0]['indicators']['quote'][0]
        sma50 = data['chart']['result'][0]['indicators']['sma'][0]['sma']
        stock_df = pd.DataFrame(quote)
        #index is symbol and timestamp
        stock_df.index = [symbol + str(x) for x in timestamp]
        #moving averages
        stock_df['sma'] = sma50
        stock_df['vol20'] = stock_df['volume'].rolling(window=20).mean()
        #predict on yesterday's averages
        stock_df['sma'] = stock_df['sma'].shift(1)
        stock_df['vol20'] = stock_df['vol20'].shift(1)
        stock_df = stock_df.dropna()
        #derived columns
        stock_df['c_0'] = stock_df['close'] / stock_df['sma']
        stock_df['h_0'] = stock_df['high'] / stock_df['sma']
        stock_df['l_0'] = stock_df['low'] / stock_df['sma']
        stock_df['o_0'] = stock_df['open'] / stock_df['sma']
        stock_df['v_0'] = stock_df['volume'] / stock_df['vol20']
        #lagged columns
        for d in range(1, shift_days + 1):
            stock_df['c_' + str(d)] = stock_df['c_0'].shift(d)
            stock_df['h_' + str(d)] = stock_df['h_0'].shift(d)
            stock_df['l_' + str(d)] = stock_df['l_0'].shift(d)
            stock_df['o_' + str(d)] = stock_df['o_0'].shift(d)
            stock_df['v_' + str(d)] = stock_df['v_0'].shift(d)
        stock_df.drop(['close', 'high', 'low', 'open', 'volume', 'sma', 'vol20', 'c_0'], axis=1, inplace=True)
        #intraday volume/high/low are unknown at prediction time;
        #approximate them with the average of the previous five sessions
        stock_df['v_0'] = (stock_df['v_1'] + stock_df['v_2'] + stock_df['v_3'] + stock_df['v_4'] + stock_df['v_5']) / 5
        stock_df['h_0'] = (stock_df['h_1'] + stock_df['h_2'] + stock_df['h_3'] + stock_df['h_4'] + stock_df['h_5']) / 5
        stock_df['l_0'] = (stock_df['l_1'] + stock_df['l_2'] + stock_df['l_3'] + stock_df['l_4'] + stock_df['l_5']) / 5
        stock_df = stock_df.dropna()
        stock_df = stock_df[~(stock_df == np.inf).any(axis=1)]
        #select the last row of the feature matrix for the latest day's open price and lagged data
        return '%s will go %s by %s, prediction date: %s' % (
            symbol,
            str(clf.predict(stock_df.iloc[[-1]].values.tolist())),
            str(clfR.predict(stock_df.iloc[[-1]].values.tolist())),
            timestamp[-1])
    except Exception as e:
        return '%s: prediction failed (%s)' % (symbol, e)
In [6]:
#download learning data from list of stocks
stock_list = ['fas','faz', 'AA', 'ABT', 'ABX', 'ADI', 'ADM', 'AET', 'AMD', 'AMR', 'APC', 'AVP', 'AXP', 'BA', 'BAC', 'BAX', 'BBY', 'BK', 'BMC', 'BMY', 'BNI', 'BP', 'CA', 'CAT', 'CI', 'CL', 'COP', 'CVX', 'DD', 'DE', 'DIS', 'DOW', 'EK', 'EMC', 'EMR', 'FNM', 'FRE', 'FRX', 'GE', 'GLW', 'GPS', 'GSK', 'HAL', 'HD', 'HON', 'HPQ', 'HRB', 'IBM', 'IGT', 'JNJ', 'JPM', 'JWN', 'KO', 'KR', 'LLY', 'LOW', 'LTD', 'LUV', 'MCD', 'MDT', 'MMM', 'MO', 'MOT', 'MRK', 'MRO', 'MU', 'MYL', 'NKE', 'NSM', 'NWS', 'OXY', 'PEP', 'PFE', 'PG', 'RSH', 'SLB', 'SLE', 'SLM', 'STJ', 'SYK', 'SYY', 'TGT', 'TJX', 'TMX', 'TXN', 'UN', 'UNH', 'UTX', 'VOD', 'VZ', 'WAG', 'WFC', 'WMB', 'WMT', 'XOM', 'XRX']
#stock_list = ['^GDAXI','^FTSE','^DJI','^FCHI','^N225','^HSI','^AXJO','^GSPC','^IXIC','^TNX','^VIX']
#with open("symbols_A.txt") as symbol_file:
#    stock_list = symbol_file.read().split('\n')
threadlist = []
for s in stock_list:
    t = threading.Thread(target=learning_downloader, args=(s,))
    t.start()
    threadlist.append(t)
    #cap the number of active threads at 50
    while threading.active_count() > 50:
        time.sleep(0.1)
        #print(threading.active_count())
for b in threadlist:
    b.join()
print("# of threads: ", len(threadlist))
In [45]:
#load and combine downloaded data for initial learning
ml_df = pd.DataFrame()
directory = './data_s'
for filename in os.listdir(directory):
    if filename.endswith('.p'):
        stock_df = pd.read_pickle(os.path.join(directory, filename))
        if len(ml_df) == 0:
            #use the first chunk of data to start the combined frame
            ml_df = stock_df
        else:
            ml_df = pd.concat([ml_df, stock_df])
#the current day's close (c_0) is not known at decision time, so drop it
ml_df = ml_df.drop(['c_0'], axis=1)
#regression learning data prep
Xr = np.array(ml_df.drop(['r'], axis=1))
yr = np.array(ml_df['r'])
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.2)
#classification data prep
ml_df['r'] = ml_df['r'].map(positive_negative)
X = np.array(ml_df.drop(['r'], axis=1))
y = np.array(ml_df['r'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#Gradient Boosting ensemble learning
clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=5).fit(X_train, y_train)
print('classification score: %s' % (clf.score(X_test, y_test)))
clfP = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=5).fit(X, y)
with open('init_mplClassifier.p', 'wb') as f:
    pk.dump(clfP, f)
#GBE regression learning
clfR = GradientBoostingRegressor(learning_rate=0.01, n_estimators=100, max_depth=5).fit(Xr_train, yr_train)
print('regression score: %s' % (clfR.score(Xr_test, yr_test)))
clfPR = GradientBoostingRegressor(learning_rate=0.01, n_estimators=100, max_depth=5).fit(Xr, yr)
with open('init_mplRegressor.p', 'wb') as f:
    pk.dump(clfPR, f)
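The next cell calls partial_fit and sets MLP-specific hyperparameters (learning_rate_init, momentum, validation_fraction), which the GradientBoosting models saved above do not support. A minimal sketch, assuming the 'mpl'-named pickles are meant to hold the MLP models imported earlier; the hidden layer size and solver are illustrative choices, not tuned values.
In [ ]:
#sketch only: train incremental MLP models so the partial-fit cell below can work;
#hidden_layer_sizes and solver are assumptions
clfP = MLPClassifier(hidden_layer_sizes=(100,), solver='sgd', max_iter=2000).fit(X, y)
clfPR = MLPRegressor(hidden_layer_sizes=(100,), solver='sgd', max_iter=2000).fit(Xr, yr)
with open('init_mplClassifier.p', 'wb') as f:
    pk.dump(clfP, f)
with open('init_mplRegressor.p', 'wb') as f:
    pk.dump(clfPR, f)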
In [ ]:
#partial fit
#note: partial_fit and the settings below are specific to MLPClassifier/MLPRegressor,
#so the pickled models must be incremental (MLP-based) learners trained on the same feature set
clfP = pk.load(open('init_mplClassifier.p', 'rb'))
clfPR = pk.load(open('init_mplRegressor.p', 'rb'))
#adjust hyperparameters for partial fitting
clfP.learning_rate = 'invscaling'
clfP.warm_start = True
clfP.learning_rate_init = 0.0001
clfP.max_iter = 2000
clfP.tol = 1e-6
clfP.momentum = 0.7
clfP.validation_fraction = 0.2
clfPR.learning_rate = 'invscaling'
clfPR.warm_start = True
clfPR.learning_rate_init = 0.0001
clfPR.max_iter = 2000
clfPR.tol = 1e-6
clfPR.momentum = 0.7
clfPR.validation_fraction = 0.2
directory = './data_s'
for filename in os.listdir(directory):
    if filename.endswith('.p'):
        stock_df = pk.load(open(os.path.join(directory, filename), 'rb'))
        stock_df = stock_df.drop(['c_0', 'v_0', 'h_0', 'l_0'], axis=1)
        stock_df = stock_df.dropna()
        stock_df = stock_df[~(stock_df == np.inf).any(axis=1)]
        if len(stock_df) != 0:
            try:
                Xpr = np.array(stock_df.drop(['r'], axis=1))
                ypr = np.array(stock_df['r'])
                Xpr, ypr = shuffle(Xpr, ypr, random_state=0)
                clfPR.partial_fit(Xpr, ypr)
                stock_df['r'] = stock_df['r'].map(positive_negative)
                Xp = np.array(stock_df.drop(['r'], axis=1))
                yp = np.array(stock_df['r'])
                Xp, yp = shuffle(Xp, yp, random_state=0)
                clfP.partial_fit(Xp, yp, classes=np.array([0, 1]))
            except Exception:
                pass
with open('mlpClassifier.p', 'wb') as f:
    pk.dump(clfP, f)
with open('mlpRegressor.p', 'wb') as f:
    pk.dump(clfPR, f)
print('classification score: %s' % (clfP.score(X_train, y_train)))
print('regression score: %s' % (clfPR.score(Xr_train, yr_train)))
In [8]:
#prediction testing
dt = datetime.datetime.now()
UnixTime = int(time.mktime(dt.timetuple()))
symbol = 'tndm'
clf = pk.load(open('init_mplClassifier.p', 'rb'))
clfR = pk.load(open('init_mplRegressor.p', 'rb'))
for d in list(range(0, 3))[::-1]:
    print(stock_predict(symbol, clf, clfR, days_ago=d))
#web variables
url = ('https://query1.finance.yahoo.com/v8/finance/chart/' + symbol +
       '?period1=' + str(UnixTime - 86400 * 5) + '&period2=' + str(UnixTime) +
       '&interval=1d&indicators=quote%7Csma~50&includePrePost=true' +
       '&events=div%7Csplit%7Cearn&lang=en-CA&region=CA&corsDomain=ca.finance.yahoo.com')
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
       'Connection': 'keep-alive'}
#json data request (urllib3, consistent with the imports above)
http = urllib3.PoolManager()
request = http.request('GET', url, headers=hdr)
data = json.loads(request.data.decode('utf-8'))
#json unpack
timestamp = data['chart']['result'][0]['timestamp']
timestamp = [datetime.datetime.fromtimestamp(x).strftime('%Y%m%d') for x in timestamp]
quote = data['chart']['result'][0]['indicators']['quote'][0]
stock_df = pd.DataFrame(quote)
#index is symbol and timestamp
stock_df.index = [symbol + str(x) for x in timestamp]
stock_df = stock_df.dropna()
#actual intraday return for comparison with the predictions above
stock_df['r'] = (stock_df['close'] - stock_df['open']) / stock_df['open']
print(stock_df)
In [ ]: