In [1]:
# NOTE(review): machine-specific workaround. The absolute path below points at
# one developer's Anaconda environment so that aiohttp can be imported there;
# on any other machine it should be adjusted or removed.
import sys # for gioia to load aiohttp
sys.path.append('/Users/maggiori/anaconda/envs/py35/lib/python3.5/site-packages')
In [2]:
# Put the parent of this notebook's directory at the front of sys.path so the
# project's modules can be imported locally, even without installing the package.
# http://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import os
import sys
import inspect

# location of the code currently executing, resolved to an absolute path
_this_file = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.dirname(_this_file)
parentdir = os.path.dirname(currentdir)

# position 0 gives the local checkout priority over any installed copy
sys.path.insert(0, parentdir)
In [3]:
import time
import signal
import subprocess
import numpy as np
from scipy.stats import norm
import json
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# global seaborn appearance for every figure in this notebook:
# the 'white' style preset and the 'notebook' context preset
sns.set_style('white')
sns.set_context('notebook')
In [4]:
from timeseries import *
We have provided a year of daily closing prices for 379 S&P 500 stocks. We have explicitly excluded stocks with incomplete or missing data. We have pre-loaded 350 stocks in the database, and have excluded 29 stocks for later use in similarity searches.
Data source: www.stockwiz.com
In [5]:
# read the two price files: stocks pre-loaded in the database ("include")
# and stocks held back for the similarity searches ("exclude")
with open('data/prices_include.json') as include_file:
    stock_data_include = json.load(include_file)
with open('data/prices_exclude.json') as exclude_file:
    stock_data_exclude = json.load(exclude_file)

# keep track of which stocks are included/excluded from the database
stocks_include = [name for name in stock_data_include.keys()]
stocks_exclude = [name for name in stock_data_exclude.keys()]

# check the number of market days in the year, using the first included stock
first_stock = stocks_include[0]
num_days = len(stock_data_include[first_stock])
num_days
Out[5]:
Let's start by initializing all the database components.
In [6]:
# 1. load the database server
# when running from the terminal
# python go_server_persistent.py --ts_length 245 --db_name 'stock_prices'
# here we load the server as a subprocess for demonstration purposes
server = subprocess.Popen(['python', '../go_server_persistent.py',
                           '--ts_length', str(num_days),
                           '--data_dir', '../db_files',
                           '--db_name', 'stock_prices'])
time.sleep(5) # make sure it loads completely
# poll() returns an exit code only if the process has already terminated;
# fail here rather than with a confusing connection error further down
# (typical cause: this cell was re-run while an earlier server is still alive)
if server.poll() is not None:
    raise RuntimeError('Database server exited during start-up '
                       '(exit code %s).' % server.returncode)

# 2. load the database webserver
# when running from the terminal
# python go_webserver.py
# here we load the server as a subprocess for demonstration purposes
webserver = subprocess.Popen(['python', '../go_webserver.py'])
time.sleep(5) # make sure it loads completely
if webserver.poll() is not None:
    # do not leave the database server running without its webserver;
    # SIGINT matches the shutdown signal used at the end of the notebook
    server.send_signal(signal.SIGINT)
    raise RuntimeError('Webserver exited during start-up '
                       '(exit code %s).' % webserver.returncode)

# 3. import the web interface and initialize it
# (imported here, after both processes are up, as in the original flow)
from webserver import *
web_interface = WebInterface()
The database is now up and running. We have pre-loaded the data for you, but you can always uncomment the code below to re-load the data if you accidentally delete it.
In [7]:
# Uncomment the lines below to (re-)insert every included stock into the database.
# # insert into database
# for stock in stocks_include:
#     web_interface.insert_ts(pk=stock, ts=TimeSeries(range(num_days), stock_data_include[stock]))
Let's check how many stocks are currently in the database (should be 350).
In [8]:
# an unfiltered select returns one entry per stored time series
db_entries = web_interface.select()
len(db_entries)
Out[8]:
Let's look at the first 10 stocks, to check that the data has been loaded correctly.
In [9]:
# fetch the time series of the first 10 stocks, ordered by ascending primary key
first_ten = {'sort_by': '+pk', 'limit': 10}
web_interface.select(fields=['ts'], additional=first_ten)
Out[9]:
We need to initialize vantage points in order to carry out a vantage point search. Again, this has already been done for you, but you can re-create the results by running the following code.
In [10]:
# Uncomment the lines below to re-create the vantage points.
# # randomly pick vantage points
# # note: this can be time-intensive for a large number of vantage points
# num_vps = 10
# random_vps = np.random.choice(len(stocks_include), size=num_vps, replace=False)
# vpkeys = [stocks_include[s] for s in random_vps]
# # mark in database
# for vp in vpkeys:
#     web_interface.insert_vp(vp)
Let's pick one of our excluded stocks and carry out a vantage point similarity search.
In [11]:
# choose a random held-out stock to use as the query
stock = np.random.choice(stocks_exclude)
print('Stock:', stock)

# vantage point similarity search, asking for the single closest series
query_ts = TimeSeries(range(num_days), stock_data_exclude[stock])
result = web_interface.vp_similarity_search(query_ts, 1)
stock_match = list(result)[0]

# fetch the matched series itself from the database
matched_rows = web_interface.select(fields=['ts'], md={'pk': stock_match})
stock_ts = matched_rows[stock_match]['ts']
print('Most similar stock:', stock_match)

# draw the query and its match on one set of axes
fig, ax = plt.subplots()
ax.plot(stock_data_exclude[stock], label='Query:' + stock)
ax.plot(stock_ts.values(), label='Result:' + stock_match)
ax.set_xticks([])
ax.legend(loc='best')
ax.set_title('Daily Stock Price Similarity')
plt.show()
Let's pick another one of our excluded stocks and carry out an iSAX tree similarity search. Note that this is an approximate search technique, so it will not always be able to find a similar stock.
In [12]:
# choose a random held-out stock to use as the query
stock = np.random.choice(stocks_exclude)
print('Stock:', stock)

# iSAX tree similarity search (approximate, so it may report no match)
query_ts = TimeSeries(range(num_days), stock_data_exclude[stock])
result = web_interface.isax_similarity_search(query_ts)
no_match = (result == 'ERROR: NO_MATCH')

if not no_match:
    # closest time series: look it up in the database
    stock_match = list(result)[0]
    matched_rows = web_interface.select(fields=['ts'], md={'pk': stock_match})
    stock_ts = matched_rows[stock_match]['ts']
    print('Most similar stock:', stock_match)

    # draw the query and its match on one set of axes
    fig, ax = plt.subplots()
    ax.plot(stock_data_exclude[stock], label='Query:' + stock)
    ax.plot(stock_ts.values(), label='Result:' + stock_match)
    ax.set_xticks([])
    ax.legend(loc='best')
    ax.set_title('Daily Stock Price Similarity')
    plt.show()
else:
    # the search came back without a match
    print('Could not find a similar stock.')
Now, let's pick one more random stock, carry out both types of similarity searches, and compare the results.
In [13]:
# pick the stock
stock = np.random.choice(stocks_exclude)
print('Stock:', stock)

# run the vantage point similarity search
result = web_interface.vp_similarity_search(TimeSeries(range(num_days), stock_data_exclude[stock]), 1)
match_vp = list(result)[0]
ts_vp = web_interface.select(fields=['ts'], md={'pk': match_vp})[match_vp]['ts']
print('VP search result:', match_vp)

# run the isax similarity search
result = web_interface.isax_similarity_search(TimeSeries(range(num_days), stock_data_exclude[stock]))

if result == 'ERROR: NO_MATCH':
    # could not find an isax match: reset both names so the plot below neither
    # raises NameError nor draws a stale match from an earlier run of this cell
    match_isax, ts_isax = None, None
    print('iSAX search result: Could not find a similar stock.')
else:
    # found a match: closest time series
    match_isax = list(result)[0]
    ts_isax = web_interface.select(fields=['ts'], md={'pk': match_isax})[match_isax]['ts']
    print('iSAX search result:', match_isax)

# visualize similarity; the query and the VP result are always available
plt.plot(stock_data_exclude[stock], label='Query:' + stock)
plt.plot(ts_vp.values(), label='Result:' + match_vp)
if ts_isax is not None:
    # the iSAX curve is drawn only when the approximate search found a match
    plt.plot(ts_isax.values(), label='Result:' + match_isax)
plt.xticks([])
plt.legend(loc='best')
plt.title('Daily Stock Price Similarity')
plt.show()
Finally, let's visualize the iSAX tree. The clusters represent groups of "similar" stocks.
In [14]:
# fetch the iSAX tree from the web interface, then print it
isax_tree_text = web_interface.isax_tree()
print(isax_tree_text)
In [15]:
# terminate processes before exiting

# Database server: request shutdown with SIGINT, as before. Going through the
# Popen object (rather than os.kill on the raw pid) and checking poll() first
# avoids an error when the server has already exited.
if server.poll() is None:
    server.send_signal(signal.SIGINT)
    try:
        # give it time to terminate; wait() also reaps the child process
        server.wait(timeout=5)
    except subprocess.TimeoutExpired:
        print('Database server has not exited after 5 seconds.')

web_interface = None

# Webserver: terminate it and reap it so no zombie process is left behind.
if webserver.poll() is None:
    webserver.terminate()
    try:
        webserver.wait(timeout=5)
    except subprocess.TimeoutExpired:
        print('Webserver has not exited after 5 seconds.')