QML-RG Homework 7

Predicting the stock market with Phased LSTMs

Alejandro Pozas-Kerstjens

p-LSTM cells introduced in https://github.com/dannyneil/public_plstm, Keras implementation from https://github.com/fferroni/PhasedLSTM-Keras

The idea of the notebook is the following: we have a collection of weekly values for a set of stocks. Given the evolution of all but one of them, we want to predict the value of the remaining one at the last (think of it as the current) moment.


In [1]:
import numpy as np
import pandas as pd

from keras.layers.core import Dense
from keras.models import Sequential
from keras.layers.recurrent import LSTM
import matplotlib.pyplot as plt

try:              # For downloading the dataset
    from urllib2 import Request, urlopen
except ImportError:
    from urllib.request import Request, urlopen
import zipfile    # For unzipping the dataset
    
from phased_lstm_keras.PhasedLSTM import PhasedLSTM    #Phased LSTM implementation


Using TensorFlow backend.

In [2]:
# Download the Dow Jones Index archive from the UCI repository and unpack it
# into the local "dow_jones_index" directory.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00312/dow_jones_index.zip"
# A browser-like User-Agent is sent explicitly (presumably the server rejects
# the default urllib agent -- TODO confirm).
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

# Close the HTTP response explicitly so the connection is released even if
# reading fails. try/finally is used instead of `with` because the Python 2
# urllib2 response object is not a context manager.
response = urlopen(req)
try:
    content = response.read()
finally:
    response.close()

with open("dow_jones_index.zip", 'wb') as f:
    f.write(content)

with zipfile.ZipFile("dow_jones_index.zip","r") as zip_ref:
    zip_ref.extractall("dow_jones_index")

In [3]:
# Load the extracted data file; its first row holds the column names.
DATA_FILE = 'dow_jones_index/dow_jones_index.data'
prices_dataset = pd.read_csv(DATA_FILE, header=0)

# The price columns ('open', 'high', 'low', 'close', 'next_weeks_open',
# 'next_weeks_close') are read as strings with a leading "$". They are left
# untouched here. The snippet below, kept disabled, would strip the dollar
# signs if those columns were ever needed as numbers:
#
#def drop_dollar(string):
#    result = string.replace("$", " ")
#    return " ".join(result.split())
#
#for col in ['open', 'high', 'low', 'close', 'next_weeks_open', 'next_weeks_close']:
#    prices_dataset[col] = prices_dataset[col].apply(lambda x: drop_dollar(x))

prices_dataset


Out[3]:
quarter stock date open high low close volume percent_change_price percent_change_volume_over_last_wk previous_weeks_volume next_weeks_open next_weeks_close percent_change_next_weeks_price days_to_next_dividend percent_return_next_dividend
0 1 AA 1/7/2011 $15.82 $16.72 $15.78 $16.42 239655616 3.792670 NaN NaN $16.71 $15.97 -4.428490 26 0.182704
1 1 AA 1/14/2011 $16.71 $16.71 $15.64 $15.97 242963398 -4.428490 1.380223 239655616.0 $16.19 $15.79 -2.470660 19 0.187852
2 1 AA 1/21/2011 $16.19 $16.38 $15.60 $15.79 138428495 -2.470660 -43.024959 242963398.0 $15.87 $16.13 1.638310 12 0.189994
3 1 AA 1/28/2011 $15.87 $16.63 $15.82 $16.13 151379173 1.638310 9.355500 138428495.0 $16.18 $17.14 5.933250 5 0.185989
4 1 AA 2/4/2011 $16.18 $17.39 $16.18 $17.14 154387761 5.933250 1.987452 151379173.0 $17.33 $17.37 0.230814 97 0.175029
5 1 AA 2/11/2011 $17.33 $17.48 $16.97 $17.37 114691279 0.230814 -25.712195 154387761.0 $17.39 $17.28 -0.632547 90 0.172712
6 1 AA 2/18/2011 $17.39 $17.68 $17.28 $17.28 80023895 -0.632547 -30.226696 114691279.0 $16.98 $16.68 -1.766780 83 0.173611
7 1 AA 2/25/2011 $16.98 $17.15 $15.96 $16.68 132981863 -1.766780 66.177694 80023895.0 $16.81 $16.58 -1.368230 76 0.179856
8 1 AA 3/4/2011 $16.81 $16.94 $16.13 $16.58 109493077 -1.368230 -17.663150 132981863.0 $16.58 $16.03 -3.317250 69 0.180941
9 1 AA 3/11/2011 $16.58 $16.75 $15.42 $16.03 114332562 -3.317250 4.419900 109493077.0 $15.95 $16.11 1.003130 62 0.187149
10 1 AA 3/18/2011 $15.95 $16.33 $15.43 $16.11 130374108 1.003130 14.030601 114332562.0 $16.38 $17.09 4.334550 55 0.186220
11 1 AA 3/25/2011 $16.38 $17.24 $16.26 $17.09 95550392 4.334550 -26.710607 130374108.0 $17.13 $17.47 1.984820 48 0.175541
12 1 AXP 1/7/2011 $43.30 $45.60 $43.11 $44.36 45102042 2.448040 NaN NaN $44.20 $46.25 4.638010 89 0.405771
13 1 AXP 1/14/2011 $44.20 $46.25 $44.01 $46.25 25913713 4.638010 -42.544258 45102042.0 $46.03 $46.00 -0.065175 82 0.389189
14 1 AXP 1/21/2011 $46.03 $46.71 $44.71 $46.00 38824728 -0.065175 49.823099 25913713.0 $46.05 $43.86 -4.755700 75 0.391304
15 1 AXP 1/28/2011 $46.05 $46.27 $43.42 $43.86 51427274 -4.755700 32.460101 38824728.0 $44.13 $43.82 -0.702470 68 0.410397
16 1 AXP 2/4/2011 $44.13 $44.23 $43.15 $43.82 39501680 -0.702470 -23.189240 51427274.0 $43.96 $46.75 6.346680 61 0.410771
17 1 AXP 2/11/2011 $43.96 $46.79 $43.88 $46.75 43746998 6.346680 10.747183 39501680.0 $46.42 $45.53 -1.917280 54 0.385027
18 1 AXP 2/18/2011 $46.42 $46.93 $45.53 $45.53 28564910 -1.917280 -34.704297 43746998.0 $44.94 $43.53 -3.137520 47 0.395344
19 1 AXP 2/25/2011 $44.94 $45.12 $43.01 $43.53 39654146 -3.137520 38.821183 28564910.0 $43.73 $43.72 -0.022868 40 0.413508
20 1 AXP 3/4/2011 $43.73 $44.68 $42.75 $43.72 38985037 -0.022868 -1.687362 39654146.0 $43.86 $44.28 0.957592 33 0.411711
21 1 AXP 3/11/2011 $43.86 $45.54 $43.53 $44.28 37613429 0.957592 -3.518293 38985037.0 $43.86 $44.17 0.706794 26 0.406504
22 1 AXP 3/18/2011 $43.86 $44.47 $42.19 $44.17 41757526 0.706794 11.017600 37613429.0 $44.75 $45.59 1.877090 19 0.407516
23 1 AXP 3/25/2011 $44.75 $45.61 $44.10 $45.59 30798332 1.877090 -26.244835 41757526.0 $45.54 $45.36 -0.395257 12 0.394823
24 1 BA 1/7/2011 $66.15 $70.10 $66.00 $69.38 36258120 4.882840 NaN NaN $69.42 $70.07 0.936330 33 0.605362
25 1 BA 1/14/2011 $69.42 $70.50 $68.35 $70.07 18834664 0.936330 -48.053942 36258120.0 $70.86 $71.68 1.157210 26 0.599401
26 1 BA 1/21/2011 $70.86 $72.99 $70.23 $71.68 29594221 1.157210 57.126355 18834664.0 $71.52 $69.23 -3.201900 19 0.585938
27 1 BA 1/28/2011 $71.52 $72.82 $69.00 $69.23 34929673 -3.201900 18.028696 29594221.0 $69.26 $71.38 3.060930 12 0.606673
28 1 BA 2/4/2011 $69.26 $71.64 $69.12 $71.38 22770062 3.060930 -34.811694 34929673.0 $71.43 $72.14 0.993980 5 0.588400
29 1 BA 2/11/2011 $71.43 $72.99 $71.15 $72.14 21809411 0.993980 -4.218921 22770062.0 $72.70 $73.04 0.467675 89 0.582201
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
720 2 VZ 6/3/2011 $36.91 $36.99 $35.60 $35.63 50174214 -3.467890 15.608455 43400125.0 $35.57 $35.19 -1.068320 33 1.375250
721 2 VZ 6/10/2011 $35.57 $36.14 $35.16 $35.19 72100710 -1.068320 43.700726 50174214.0 $35.23 $35.51 0.794777 26 1.392440
722 2 VZ 6/17/2011 $35.23 $35.92 $34.94 $35.51 69585912 0.794777 -3.487896 72100710.0 $35.35 $36.00 1.838760 19 1.379890
723 2 VZ 6/24/2011 $35.35 $36.17 $35.20 $36.00 82375230 1.838760 18.379177 69585912.0 $36.00 $37.80 5.000000 12 1.361110
724 2 WMT 4/1/2011 $52.31 $52.74 $51.89 $52.13 44519431 -0.344102 -16.607132 53385178.0 $52.31 $52.54 0.439686 40 0.709764
725 2 WMT 4/8/2011 $52.31 $53.24 $52.06 $52.54 48382262 0.439686 8.676730 44519431.0 $52.54 $53.55 1.922340 33 0.704225
726 2 WMT 4/15/2011 $52.54 $53.96 $52.40 $53.55 45662991 1.922340 -5.620388 48382262.0 $53.08 $53.58 0.941974 26 0.690943
727 2 WMT 4/21/2011 $53.08 $53.95 $52.91 $53.58 23073491 0.941974 -49.470040 45662991.0 $53.60 $54.98 2.574630 20 0.690556
728 2 WMT 4/29/2011 $53.60 $55.16 $53.23 $54.98 47159044 2.574630 104.386254 23073491.0 $55.08 $55.02 -0.108932 12 0.672972
729 2 WMT 5/6/2011 $55.08 $55.79 $53.20 $55.02 46668727 -0.108932 -1.039709 47159044.0 $54.97 $55.72 1.364380 5 0.672483
730 2 WMT 5/13/2011 $54.97 $56.03 $54.70 $55.72 42649301 1.364380 -8.612675 46668727.0 $55.61 $55.29 -0.575436 89 0.664034
731 2 WMT 5/20/2011 $55.61 $56.47 $54.95 $55.29 56695842 -0.575436 32.934985 42649301.0 $54.90 $54.70 -0.364299 82 0.669199
732 2 WMT 5/27/2011 $54.90 $55.50 $54.10 $54.70 35560853 -0.364299 -37.277847 56695842.0 $54.88 $53.66 -2.223030 75 0.676417
733 2 WMT 6/3/2011 $54.88 $55.22 $53.04 $53.66 48156051 -2.223030 35.418717 35560853.0 $53.86 $52.72 -2.116600 68 0.689527
734 2 WMT 6/10/2011 $53.86 $54.30 $52.69 $52.72 58746396 -2.116600 21.991722 48156051.0 $52.91 $52.82 -0.170100 61 0.701821
735 2 WMT 6/17/2011 $52.91 $53.29 $51.79 $52.82 68996550 -0.170100 17.448141 58746396.0 $52.70 $52.41 -0.550285 54 0.700492
736 2 WMT 6/24/2011 $52.70 $53.70 $52.35 $52.41 75602550 -0.550285 9.574392 68996550.0 $52.32 $53.51 2.274460 47 0.705972
737 2 XOM 4/1/2011 $83.86 $84.94 $82.61 $84.68 77569311 0.977820 -6.951645 83364516.0 $84.26 $85.95 2.005700 40 0.555031
738 2 XOM 4/8/2011 $84.26 $86.22 $84.07 $85.95 75566590 2.005700 -2.581847 77569311.0 $85.95 $84.29 -1.931360 33 0.546830
739 2 XOM 4/15/2011 $85.95 $86.15 $82.38 $84.29 81056144 -1.931360 7.264525 75566590.0 $83.11 $86.36 3.910480 26 0.557599
740 2 XOM 4/21/2011 $83.11 $86.36 $82.44 $86.36 43378708 3.910480 -46.483134 81056144.0 $86.29 $87.98 1.958510 20 0.544233
741 2 XOM 4/29/2011 $86.29 $88.00 $85.89 $87.98 81596868 1.958510 88.103500 43378708.0 $88.10 $82.69 -6.140750 12 0.534212
742 2 XOM 5/6/2011 $88.10 $88.13 $81.59 $82.69 113805856 -6.140750 39.473314 81596868.0 $83.01 $80.87 -2.578000 5 0.568388
743 2 XOM 5/13/2011 $83.01 $83.76 $79.42 $80.87 99678100 -2.578000 -12.413910 113805856.0 $80.22 $81.57 1.682870 89 0.581180
744 2 XOM 5/20/2011 $80.22 $82.58 $79.60 $81.57 86758820 1.682870 -12.961001 99678100.0 $80.22 $82.63 3.004240 82 0.576192
745 2 XOM 5/27/2011 $80.22 $82.63 $80.07 $82.63 68230855 3.004240 -21.355713 86758820.0 $83.28 $81.18 -2.521610 75 0.568801
746 2 XOM 6/3/2011 $83.28 $83.75 $80.18 $81.18 78616295 -2.521610 15.221032 68230855.0 $80.93 $79.78 -1.420980 68 0.578960
747 2 XOM 6/10/2011 $80.93 $81.87 $79.72 $79.78 92380844 -1.420980 17.508519 78616295.0 $80.00 $79.02 -1.225000 61 0.589120
748 2 XOM 6/17/2011 $80.00 $80.82 $78.33 $79.02 100521400 -1.225000 8.811952 92380844.0 $78.65 $76.78 -2.377620 54 0.594786
749 2 XOM 6/24/2011 $78.65 $81.12 $76.78 $76.78 118679791 -2.377620 18.064204 100521400.0 $76.88 $82.01 6.672740 47 0.612139

750 rows × 16 columns


In [4]:
# List every ticker exactly once, in order of first appearance in the file.
# The previous approach collapsed consecutive repeats and then kept the first
# half of the result, which is only right when the file holds exactly two
# quarters listing the same tickers in the same order. `unique()` keeps
# first-appearance order and makes no such assumption.
stocks = prices_dataset['stock'].unique().tolist()

In [5]:
def weekly_series(column):
    """Collect the values of `column` for every ticker in `stocks`.

    Each ticker contributes one float32 row containing all of its entries in
    `prices_dataset`, in file order. The rows are stacked with `np.array`,
    which yields a 2-D (stock, week) array provided every ticker has the same
    number of rows.
    """
    rows = [
        prices_dataset.loc[prices_dataset['stock'] == ticker, column]
        .values.astype('float32')
        for ticker in stocks
    ]
    return np.array(rows)


# Weekly percent change in price for each stock
prices = weekly_series('percent_change_price')

# Percent change in price over the following week, aligned with `prices`
nw_prices = weekly_series('percent_change_next_weeks_price')

In [6]:
# Overlay one line per stock so the weekly percent changes can be compared
# at a glance. Title and axis labels let the figure stand on its own.
for price in prices:
    plt.plot(price)
plt.title('Weekly percent change in price per stock')
plt.xlabel('Week index')
plt.ylabel('Percent change in price')
plt.show()



In [7]:
# Hold out the last stock as the single test example and train on the rest.
# Inputs are the full weekly series; the target is the next-week change that
# follows the final week of each series.
trainX, testX = prices[:-1, :], prices[-1, :]
trainY, testY = nw_prices[:-1, -1], nw_prices[-1, -1]

# Add a trailing feature axis of size 1 so the arrays are laid out as
# (samples, timesteps, features); the test series also gains a leading
# sample axis of size 1.
trainX = np.expand_dims(trainX, axis=-1)
testX = testX.reshape((1, -1, 1))

In [8]:
def build_recurrent_regressor(recurrent_layer, input_shape, units=100, dropout=0.2):
    """Build and compile a one-layer recurrent regressor.

    Parameters
    ----------
    recurrent_layer : class
        Recurrent layer class to instantiate (e.g. ``PhasedLSTM`` or ``LSTM``).
    input_shape : tuple
        Shape of one input sample, passed to the layer as ``input_shape``.
    units : int
        Number of units in the recurrent layer.
    dropout : float
        Value passed to the layer's ``dropout`` argument.

    Returns
    -------
    A compiled ``Sequential`` model: the recurrent layer followed by a single
    linear output unit, trained with mean squared error and the Adam optimizer.
    """
    model = Sequential()
    model.add(recurrent_layer(units, input_shape=input_shape, dropout=dropout))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mse', optimizer='adam')
    return model


# Take the per-sample shape from the training data instead of hard-coding it,
# so the models stay valid if the number of weeks in the dataset changes.
sample_shape = trainX.shape[1:]

# Build pLSTM Model
model_plstm = build_recurrent_regressor(PhasedLSTM, sample_shape)

# And LSTM model to compare (identical architecture apart from the cell type)
model_lstm = build_recurrent_regressor(LSTM, sample_shape)

In [9]:
# Train both models with identical settings so their results are comparable.
fit_options = dict(epochs=20, batch_size=1)

model_plstm.fit(trainX, trainY, **fit_options)
# Kept as the cell's final expression so its History object is still displayed.
model_lstm.fit(trainX, trainY, **fit_options)


Epoch 1/20
29/29 [==============================] - 1s - loss: 28.8505      
Epoch 2/20
29/29 [==============================] - 0s - loss: 26.5135     
Epoch 3/20
29/29 [==============================] - 0s - loss: 21.4088     
Epoch 4/20
29/29 [==============================] - 0s - loss: 9.4075      
Epoch 5/20
29/29 [==============================] - 0s - loss: 2.5362     
Epoch 6/20
29/29 [==============================] - 0s - loss: 2.4428     
Epoch 7/20
29/29 [==============================] - 0s - loss: 1.9597     
Epoch 8/20
29/29 [==============================] - 0s - loss: 2.0989     
Epoch 9/20
29/29 [==============================] - 0s - loss: 2.0810     
Epoch 10/20
29/29 [==============================] - 0s - loss: 2.6290      
Epoch 11/20
29/29 [==============================] - 0s - loss: 2.0493     
Epoch 12/20
29/29 [==============================] - 0s - loss: 1.7554     
Epoch 13/20
29/29 [==============================] - 0s - loss: 1.7525     
Epoch 14/20
29/29 [==============================] - 0s - loss: 2.1139     
Epoch 15/20
29/29 [==============================] - 0s - loss: 1.7890     
Epoch 16/20
29/29 [==============================] - 0s - loss: 2.0589     
Epoch 17/20
29/29 [==============================] - 0s - loss: 1.9555     
Epoch 18/20
29/29 [==============================] - 0s - loss: 1.7033     
Epoch 19/20
29/29 [==============================] - 0s - loss: 2.0388     
Epoch 20/20
29/29 [==============================] - 0s - loss: 1.2213     
Epoch 1/20
29/29 [==============================] - 0s - loss: 23.9491      
Epoch 2/20
29/29 [==============================] - 0s - loss: 6.0394     
Epoch 3/20
29/29 [==============================] - 0s - loss: 3.0180     
Epoch 4/20
29/29 [==============================] - 0s - loss: 2.7289     
Epoch 5/20
29/29 [==============================] - 0s - loss: 2.6984     
Epoch 6/20
29/29 [==============================] - 0s - loss: 2.6432         
Epoch 7/20
29/29 [==============================] - 0s - loss: 2.6630         
Epoch 8/20
29/29 [==============================] - 0s - loss: 2.4997     
Epoch 9/20
29/29 [==============================] - 0s - loss: 2.2893     
Epoch 10/20
29/29 [==============================] - 0s - loss: 2.2467     
Epoch 11/20
29/29 [==============================] - 0s - loss: 2.2890     
Epoch 12/20
29/29 [==============================] - 0s - loss: 2.3353     
Epoch 13/20
29/29 [==============================] - 0s - loss: 2.6675     
Epoch 14/20
29/29 [==============================] - 0s - loss: 2.3688     
Epoch 15/20
29/29 [==============================] - 0s - loss: 2.0349         
Epoch 16/20
29/29 [==============================] - 0s - loss: 2.3596     
Epoch 17/20
29/29 [==============================] - 0s - loss: 2.1883     
Epoch 18/20
29/29 [==============================] - 0s - loss: 1.9615     
Epoch 19/20
29/29 [==============================] - 0s - loss: 2.1379     
Epoch 20/20
29/29 [==============================] - 0s - loss: 2.0856     
Out[9]:
<keras.callbacks.History at 0x15fa7710>

In [10]:
# Predict the held-out stock's next-week change with each model.
pred_plstm = model_plstm.predict(testX)
pred_lstm = model_lstm.predict(testX)

# Relative error |prediction - target| / |target|. Dividing by the magnitude
# of the target (not its signed value) keeps the error non-negative when the
# true change is negative.
target_magnitude = abs(testY)
err_plstm = abs(pred_plstm[0][0] - testY) / target_magnitude
err_lstm = abs(pred_lstm[0][0] - testY) / target_magnitude

print('The error with Phased LSTM is {}, while with LSTM is {}'.format(err_plstm, err_lstm))


The error with Phased LSTM is 0.08319906890392303, while with LSTM is 0.27037063241004944