GP13: Predicting the stock market

1: The Dataset & Reading In The Data


In [1]:
# Reference: date filters in this notebook take the form df["Date"] > datetime(year=2015, month=4, day=1)
import pandas as pd
from datetime import datetime

# Load the S&P 500 history; the path is relative to where the notebook runs.
sphist = pd.read_csv("../data/GP13/sphist.csv")

# Show the first few "Date" values and the column dtype before conversion.
print(sphist["Date"].head(3), sphist["Date"].dtype, sep="\n")

# Convert "Date" to datetime so rows can be ordered and compared by date.
sphist["Date"] = pd.to_datetime(sphist["Date"])
print(sphist["Date"].dtype)

# Reorder rows oldest-first (ascending is the default); the original row
# labels are kept, so the index now runs from high to low.
sphist = sphist.sort_values(by="Date")

# Confirm the new ordering on the "Date" column and on the whole frame.
print(sphist["Date"].head(3), sphist.head(3), sep="\n")


0    2015-12-07
1    2015-12-04
2    2015-12-03
Name: Date, dtype: object
object
datetime64[ns]
16589   1950-01-03
16588   1950-01-04
16587   1950-01-05
Name: Date, dtype: datetime64[ns]
            Date   Open   High    Low  Close     Volume  Adj Close
16589 1950-01-03  16.66  16.66  16.66  16.66  1260000.0      16.66
16588 1950-01-04  16.85  16.85  16.85  16.85  1890000.0      16.85
16587 1950-01-05  16.93  16.93  16.93  16.93  2550000.0      16.93

2: Generating Indicators


In [2]:
# Shift "Close" down by one row so every indicator for a given row is built
# only from closes of earlier rows (the row's own close is the target).
shifted_close = sphist["Close"].shift(1)

# One trailing (non-centered) rolling view per window size; windows count rows.
rolling_5 = shifted_close.rolling(window=5)
rolling_30 = shifted_close.rolling(window=30)
rolling_365 = shifted_close.rolling(window=365)

# Rolling means of the previous 5 / 30 / 365 closes.
sphist["day_5"] = rolling_5.mean()
sphist["day_30"] = rolling_30.mean()
sphist["day_365"] = rolling_365.mean()

# Rolling standard deviations of the previous 5 / 365 closes.
sphist["std_5"] = rolling_5.std()
sphist["std_365"] = rolling_365.std()

# Short-window statistic relative to the long-window one.
sphist["rday_5_365"] = sphist["day_5"].div(sphist["day_365"])
sphist["rstd_5_365"] = sphist["std_5"].div(sphist["std_365"])

3: Splitting Up The Data


In [3]:
# Columns of the analytics base table: date, target ("Close") and indicators.
cols = [
    "Date", "Close",
    "day_5", "day_30", "day_365",
    "std_5", "std_365",
    "rday_5_365", "rstd_5_365",
]

# Cutoffs: rows must fall after start_date; split_date separates train/test.
start_date = datetime(year=1951, month=1, day=2)
split_date = datetime(year=2013, month=1, day=1)

# Keep the selected columns for rows after start_date, then drop every row
# that still has a NaN in any column (rolling windows not yet full).
ABT = sphist.loc[sphist["Date"] > start_date, cols].dropna(axis=0)
print(ABT.head())

# Rows strictly before split_date train the model; the rest are held out.
is_train = ABT["Date"] < split_date
train = ABT[is_train]
test = ABT[ABT["Date"] >= split_date]

print(train.tail())
print(test.head())


            Date      Close   day_5     day_30    day_365     std_5   std_365  \
16224 1951-06-19  22.020000  21.800  21.703333  19.447726  0.256223  1.790253   
16223 1951-06-20  21.910000  21.900  21.683000  19.462411  0.213659  1.789307   
16222 1951-06-21  21.780001  21.972  21.659667  19.476274  0.092574  1.788613   
16221 1951-06-22  21.549999  21.960  21.631000  19.489562  0.115108  1.787659   
16220 1951-06-25  21.290001  21.862  21.599000  19.502082  0.204132  1.786038   

       rday_5_365  rstd_5_365  
16224    1.120954    0.143121  
16223    1.125246    0.119409  
16222    1.128142    0.051758  
16221    1.126757    0.064390  
16220    1.121008    0.114293  
          Date        Close        day_5       day_30      day_365      std_5  \
743 2012-12-24  1426.660034  1437.360010  1405.926001  1326.114028   7.622009   
742 2012-12-26  1419.829956  1436.620019  1407.486336  1326.412494   8.589693   
741 2012-12-27  1418.099976  1431.228003  1408.813000  1326.716494   9.058684   
740 2012-12-28  1402.430054  1427.685986  1410.265332  1326.995836  10.208568   
739 2012-12-31  1426.189941  1419.434009  1411.830001  1327.261562  10.701861   

       std_365  rday_5_365  rstd_5_365  
743  89.830647    1.083889    0.084849  
742  89.983530    1.083087    0.095458  
741  90.111444    1.078775    0.100528  
740  90.236516    1.075878    0.113131  
739  90.315637    1.069446    0.118494  
          Date        Close        day_5       day_30      day_365      std_5  \
738 2013-01-02  1462.420044  1418.641992  1414.258667  1327.534055   9.820801   
737 2013-01-03  1459.369995  1425.793994  1417.676668  1327.908247  22.261321   
736 2013-01-04  1466.469971  1433.702002  1420.092668  1328.224877  26.274326   
735 2013-01-07  1461.890015  1443.376001  1422.714665  1328.557617  27.945242   
734 2013-01-08  1457.150024  1455.267993  1425.076664  1328.898603  16.453319   

       std_365  rday_5_365  rstd_5_365  
738  90.463948    1.068629    0.108560  
737  90.738976    1.073714    0.245334  
736  90.995857    1.079412    0.288742  
735  91.279049    1.086423    0.306152  
734  91.544368    1.095093    0.179731  

4: Making Predictions


In [4]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Model inputs are the indicator columns; the target is kept as a one-element
# list so train[target] is a DataFrame and predictions come back as a column.
features = [
    "day_5", "day_30", "day_365",
    "std_5", "std_365",
    "rday_5_365", "rstd_5_365",
]
target = ["Close"]

# Fit a linear regression on the training rows and predict the held-out rows.
lr = LinearRegression().fit(train[features], train[target])
predictions = lr.predict(test[features])

# Compare the first five predictions with the first five actual closes.
print(predictions[:5], test[target][:5], sep="\n")

# Score on the test set: mean squared error and its square root (RMSE).
mse = mean_squared_error(test["Close"], predictions)
rmse = mse ** 0.5
print(mse, rmse, sep="\n")


[[ 1419.35440744]
 [ 1425.50578007]
 [ 1433.37973633]
 [ 1443.36069543]
 [ 1457.05569669]]
           Close
738  1462.420044
737  1459.369995
736  1466.469971
735  1461.890015
734  1457.150024
492.923034445
22.2018700664