In [90]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error

%matplotlib inline

%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

%load_ext autoreload
%autoreload 2

sys.path.append('../../')

from utils import preprocessing as pp


Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [91]:
%pwd


Out[91]:
'/home/miguel/udacity/Machine Learning Nanodegree/projects/capstone/capstone/notebooks/dev'

In [92]:
# Getting the data
data_df = pd.read_pickle('../../data/data_train_val_df.pkl')

sys.path.append('../../')
import predictor.feature_extraction as fe

train_time = -1 # In real time days
base_days = 7 # In market days
step_days = 7 # market days
ahead_days = 1 # market days
today = data_df.index[-1] # Real date

tic = time()
x, y = fe.generate_train_intervals(data_df, 
                                   train_time, 
                                   base_days, 
                                   step_days, 
                                   ahead_days, 
                                   today, 
                                   fe.feature_close_one_to_one)
toc = time()
print('Elapsed time: %i seconds.' % (toc-tic))
print(data_df.shape)


Elapsed time: 186 seconds.
(5520, 2415)

Get the results of a single run


In [93]:
from predictor import evaluation as ev
from predictor.dummy_mean_predictor import DummyPredictor

In [94]:
predictor = DummyPredictor()

In [95]:
y_train_true_df, y_train_pred_df, y_val_true_df, y_val_pred_df = ev.run_single_val(x, y, ahead_days, predictor)


Multiindex = True
2014-12-16 00:00:00
23
30

In [96]:
print(y_train_true_df.shape)
print(y_train_pred_df.shape)
print(y_val_true_df.shape)
print(y_val_pred_df.shape)


(314475, 1)
(314475, 1)
(479, 1)
(479, 1)

In [97]:
y_train_true_df.head()


Out[97]:
target
1993-01-29 AAL 19.75
ABBV 2.03
ABC 2.44
ADI 22.75
ADP 15.69

In [98]:
y_train_pred_df.head()


Out[98]:
target
1993-01-29 AAL 19.730000
ABBV 2.111429
ABC 2.595714
ADI 22.822857
ADP 15.481429

In [99]:
y_val_true_df.head()


Out[99]:
target
2014-12-16 A 47.50
AAL 2.65
AAP 56.28
AAPL 167.44
ABBV 113.99

In [11]:
y_val_pred_df.head()


Out[11]:
target
2014-12-16 A 44.962857
AAL 2.592857
AAP 56.491429
AAPL 165.978571
ABBV 111.152857

Done. Let's test the reshape_by_symbol function


In [12]:
y_train_true_rs = ev.reshape_by_symbol(y_train_true_df)
print(y_train_true_rs.shape)
y_train_true_rs.head()


(787, 482)
Out[12]:
ticker A AAL AAP AAPL ABBV ABC ABT ACN ADBE ADI ... XLNX XOM XRAY XRX XYL YHOO YUM ZBH ZION ZTS
date
1993-01-29 NaN 19.75 NaN NaN 2.03 2.44 NaN NaN NaN 22.75 ... 14.40 10.29 NaN 7.25 NaN NaN NaN NaN 11.56 NaN
1993-02-09 NaN 22.12 NaN NaN 1.96 2.50 NaN NaN NaN 21.38 ... 13.82 10.04 NaN 6.75 NaN NaN NaN NaN 10.69 NaN
1993-02-19 NaN 22.25 NaN NaN 1.94 2.44 NaN NaN NaN 21.62 ... 13.97 10.49 NaN 6.42 NaN NaN NaN NaN 11.50 NaN
1993-03-02 NaN 23.75 NaN NaN 2.03 2.44 NaN NaN NaN 22.69 ... 14.05 10.74 NaN 7.50 NaN NaN NaN NaN 11.75 NaN
1993-03-11 NaN 22.25 NaN NaN 1.90 2.39 NaN NaN NaN 22.00 ... 13.59 10.91 NaN 6.80 NaN NaN NaN NaN 12.00 NaN

5 rows × 482 columns


In [13]:
y_train_pred_rs = ev.reshape_by_symbol(y_train_pred_df)
print(y_train_pred_rs.shape)
y_train_pred_rs.head()


(787, 482)
Out[13]:
ticker A AAL AAP AAPL ABBV ABC ABT ACN ADBE ADI ... XLNX XOM XRAY XRX XYL YHOO YUM ZBH ZION ZTS
date
1993-01-29 NaN 19.730000 NaN NaN 2.111429 2.595714 NaN NaN NaN 22.822857 ... 14.224286 10.175714 NaN 7.888571 NaN NaN NaN NaN 11.374286 NaN
1993-02-09 NaN 20.447143 NaN NaN 1.954286 2.451429 NaN NaN NaN 22.285714 ... 14.102857 10.068571 NaN 7.261429 NaN NaN NaN NaN 11.155714 NaN
1993-02-19 NaN 21.767143 NaN NaN 1.934286 2.402857 NaN NaN NaN 21.671429 ... 13.700000 10.248571 NaN 6.497143 NaN NaN NaN NaN 11.205714 NaN
1993-03-02 NaN 23.177143 NaN NaN 1.984286 2.418571 NaN NaN NaN 22.345714 ... 13.998571 10.571429 NaN 6.847143 NaN NaN NaN NaN 11.282857 NaN
1993-03-11 NaN 23.410000 NaN NaN 1.991429 2.430000 NaN NaN NaN 22.285714 ... 13.835714 10.688571 NaN 7.540000 NaN NaN NaN NaN 11.615714 NaN

5 rows × 482 columns


In [14]:
y_val_true_rs = ev.reshape_by_symbol(y_val_true_df)
print(y_val_true_rs.shape)
y_val_true_rs.head()


(1, 479)
Out[14]:
ticker A AAL AAP AAPL ABBV ABC ABT ACN ADBE ADI ... XLNX XOM XRAY XRX XYL YHOO YUM ZBH ZION ZTS
date
2014-12-16 47.5 2.65 56.28 167.44 113.99 59.97 160.0 66.98 91.26 149.12 ... 14.14 20.59 58.4 53.84 38.95 50.86 73.14 114.17 28.56 44.2

1 rows × 479 columns

So, the reshape_by_symbol function seems to work with run_single_val. It could be added to it. Let's test the roll_evaluate function.


In [15]:
u = x.index.levels[0][0]
print(u)


1993-01-29 00:00:00

In [16]:
fe.SPY_DF.sort_index().index.unique()


Out[16]:
DatetimeIndex(['1993-01-29', '1993-02-01', '1993-02-02', '1993-02-03',
               '1993-02-04', '1993-02-05', '1993-02-08', '1993-02-09',
               '1993-02-10', '1993-02-11',
               ...
               '2014-12-17', '2014-12-18', '2014-12-19', '2014-12-22',
               '2014-12-23', '2014-12-24', '2014-12-26', '2014-12-29',
               '2014-12-30', '2014-12-31'],
              dtype='datetime64[ns]', name='date', length=5520, freq=None)

In [17]:
md = fe.SPY_DF.index.unique()

In [18]:
u in md


Out[18]:
True

In [19]:
fe.add_market_days(u,6)


Out[19]:
Timestamp('1993-02-08 00:00:00')

Let's do some previous filtering to avoid problems


In [101]:
# Getting the data
GOOD_DATA_RATIO = 0.99

data_df = pd.read_pickle('../../data/data_train_val_df.pkl')

sys.path.append('../../')
import predictor.feature_extraction as fe
import utils.preprocessing as pp

data_df = pp.drop_irrelevant_symbols(data_df, GOOD_DATA_RATIO)

train_time = -1 # In real time days
base_days = 7 # In market days
step_days = 7 # market days
ahead_days = 1 # market days
today = data_df.index[-1] # Real date

tic = time()
x, y = fe.generate_train_intervals(data_df, 
                                   train_time, 
                                   base_days, 
                                   step_days, 
                                   ahead_days, 
                                   today, 
                                   fe.feature_close_one_to_one)
toc = time()
print('Elapsed time: %i seconds.' % (toc-tic))


Elapsed time: 126 seconds.

In [102]:
print(data_df.shape)
data_df.head()


(5520, 1425)
Out[102]:
feature Close ... Volume
SPY MMM ABT ADBE AMD AES AET AFL APD ALK ... HCN WDC WY WHR WFM WMB XEL XRX XLNX ZION
date
1993-01-29 43.94 24.50 6.88 2.59 18.75 4.41 6.42 4.49 21.94 4.19 ... 10600.0 1552200.0 805900.0 230300.0 268800.0 732198.0 87800.0 7633602.0 1745196.0 33600.0
1993-02-01 44.25 24.69 6.88 2.72 19.12 4.53 6.64 4.52 22.38 4.19 ... 11900.0 869600.0 647900.0 160200.0 14400.0 722598.0 72400.0 3001200.0 3574800.0 32000.0
1993-02-02 44.34 24.72 6.53 2.84 20.25 4.53 6.62 4.57 22.31 4.16 ... 9700.0 1149400.0 1189900.0 118100.0 163200.0 808398.0 242200.0 1388598.0 2652396.0 251600.0
1993-02-03 44.81 25.19 6.91 2.70 20.50 4.49 6.50 4.65 22.69 4.22 ... 5300.0 611000.0 1542300.0 246300.0 257600.0 3141198.0 272200.0 1228200.0 5040396.0 254800.0
1993-02-04 45.00 26.06 6.84 2.73 20.12 4.49 6.73 4.84 23.19 4.38 ... 11700.0 1102800.0 1313800.0 354500.0 1473600.0 1550202.0 162800.0 1675602.0 7033200.0 317200.0

5 rows × 1425 columns


In [103]:
SAMPLES_GOOD_DATA_RATIO = 0.9

x_y_df = pd.concat([x, y], axis=1)
x_y_df = pp.drop_irrelevant_samples(x_y_df, SAMPLES_GOOD_DATA_RATIO)
x = x_y_df.iloc[:, :-1]
y = x_y_df.iloc[:, -1]
x = pp.fill_missing(x)

In [104]:
x_y_df.isnull().sum()


Out[104]:
0         0
1         0
2         0
3         0
4         0
5         0
6         0
target    0
dtype: int64

In [105]:
x.isnull().sum().sum()


Out[105]:
0

In [106]:
y.isnull().sum()


Out[106]:
0

In [107]:
x_reshaped = ev.reshape_by_symbol(x)
x_reshaped.head()


Out[107]:
ticker AAPL ABT ADBE ADM ADP ADSK AEP AES AET AFL ... WHR WMB WMT WY XEL XLNX XOM XRAY XRX ZION
date
1993-01-29 21.94 2.59 11.24 223.13 13.44 17.62 4.19 5.94 33.88 4.41 ... 42.88 2.49 4.44 16.41 15.72 14.55 38.34 4.92 7.32 11.81
1993-02-09 22.75 2.59 11.03 225.75 13.44 17.62 4.34 5.50 34.38 4.29 ... 41.88 2.36 4.62 15.47 15.47 13.65 41.08 4.81 6.88 10.38
1993-02-19 21.38 2.73 10.71 213.38 13.56 16.81 4.00 5.81 35.12 4.41 ... 41.88 2.52 4.06 16.50 15.97 13.80 41.96 4.67 6.36 11.44
1993-03-02 21.62 2.75 11.51 210.38 13.69 17.06 4.06 5.59 35.88 4.53 ... 44.62 2.98 4.44 17.00 15.97 14.17 41.21 4.69 7.50 11.44
1993-03-11 22.69 2.80 11.24 212.63 14.31 17.31 4.19 5.78 36.00 4.61 ... 42.25 2.81 2.81 16.56 16.16 13.74 43.71 4.62 6.88 11.81

5 rows × 1995 columns


In [108]:
x_reshaped.isnull().sum().max()


Out[108]:
6

In [109]:
x.shape


Out[109]:
(224493, 7)

In [110]:
x_reshaped.shape


Out[110]:
(788, 1995)

In [111]:
x_reshaped[x_reshaped.notnull()]


Out[111]:
ticker AAPL ABT ADBE ADM ADP ADSK AEP AES AET AFL ... WHR WMB WMT WY XEL XLNX XOM XRAY XRX ZION
date
1993-01-29 21.94 2.59 11.24 223.13 13.44 17.62 4.19 5.94 33.88 4.41 ... 42.88 2.49 4.44 16.41 15.72 14.55 38.34 4.92 7.32 11.81
1993-02-09 22.75 2.59 11.03 225.75 13.44 17.62 4.34 5.50 34.38 4.29 ... 41.88 2.36 4.62 15.47 15.47 13.65 41.08 4.81 6.88 10.38
1993-02-19 21.38 2.73 10.71 213.38 13.56 16.81 4.00 5.81 35.12 4.41 ... 41.88 2.52 4.06 16.50 15.97 13.80 41.96 4.67 6.36 11.44
1993-03-02 21.62 2.75 11.51 210.38 13.69 17.06 4.06 5.59 35.88 4.53 ... 44.62 2.98 4.44 17.00 15.97 14.17 41.21 4.69 7.50 11.44
1993-03-11 22.69 2.80 11.24 212.63 14.31 17.31 4.19 5.78 36.00 4.61 ... 42.25 2.81 2.81 16.56 16.16 13.74 43.71 4.62 6.88 11.81
1993-03-22 22.00 2.70 11.08 210.38 13.72 17.25 4.19 5.17 36.00 4.70 ... 41.25 2.83 3.00 15.56 16.62 13.92 41.33 4.77 7.46 12.25
1993-03-31 22.06 2.69 11.03 193.50 13.66 17.50 4.28 5.44 36.88 4.98 ... 41.75 2.71 2.44 14.44 17.00 13.26 42.71 4.69 6.63 12.06
1993-04-12 21.19 2.66 10.55 185.63 14.25 18.38 4.34 5.31 37.62 5.10 ... 45.50 2.62 2.44 13.19 16.66 13.22 43.96 4.59 5.96 11.56
1993-04-21 21.31 2.91 10.33 197.63 12.69 18.75 4.28 5.12 37.50 5.02 ... 44.25 2.64 2.44 13.38 16.50 12.55 42.71 4.67 5.96 10.81
1993-04-30 21.88 3.58 10.71 198.38 12.41 19.00 4.16 5.06 37.12 5.10 ... 42.88 2.91 2.69 13.06 15.91 12.30 41.21 4.75 6.34 10.44
1993-05-11 21.62 3.72 10.28 193.13 12.47 19.50 4.06 5.92 35.75 4.90 ... 44.50 3.13 2.62 13.94 16.16 12.20 41.96 4.70 6.88 10.00
1993-05-20 22.25 4.19 10.33 204.00 11.72 20.31 4.22 6.69 34.75 4.74 ... 45.25 2.99 2.56 13.94 16.38 12.61 41.21 4.94 6.48 10.56
1993-06-01 21.56 4.22 10.28 205.50 12.00 19.88 4.03 6.22 35.00 4.90 ... 41.88 2.84 2.25 13.31 16.84 12.36 40.21 5.05 6.29 9.88
1993-06-10 21.00 4.22 10.38 202.13 11.56 19.19 4.16 6.47 36.38 4.96 ... 41.00 2.85 2.00 12.56 16.44 12.86 40.58 5.05 5.88 9.62
1993-06-21 20.19 4.01 10.38 202.88 12.25 19.25 3.69 6.69 36.50 5.02 ... 41.50 3.21 1.94 13.12 16.31 13.40 40.21 5.28 5.54 10.06
1993-06-30 19.62 3.86 10.12 210.00 12.41 18.50 3.75 7.05 37.75 4.90 ... 43.50 3.14 2.00 13.12 15.97 12.90 43.33 5.31 5.63 9.69
1993-07-12 20.75 3.67 10.28 211.50 12.69 18.69 3.44 5.97 37.75 5.02 ... 39.12 3.31 1.94 12.88 16.03 12.24 43.46 5.20 5.92 10.12
1993-07-21 20.94 3.29 9.96 210.00 12.44 19.06 3.25 5.69 37.75 5.06 ... 39.12 3.39 2.12 12.88 16.53 12.26 46.20 5.58 6.40 10.12
1993-07-30 20.12 3.09 10.06 213.75 12.59 19.69 3.19 5.16 38.38 5.14 ... 41.00 3.57 2.88 12.75 16.16 12.45 48.58 5.58 6.80 10.31
1993-08-10 21.06 3.37 10.01 217.88 13.09 19.38 3.12 5.70 38.00 5.10 ... 42.12 3.83 3.06 12.75 16.03 12.38 50.07 5.48 6.50 10.66
1993-08-19 21.75 2.95 10.35 223.13 12.84 19.88 3.22 5.17 37.88 5.06 ... 41.12 3.66 2.88 12.94 16.34 12.32 52.70 5.52 5.88 10.50
1993-08-30 22.00 2.64 10.01 227.63 12.66 19.50 3.25 6.19 38.75 5.14 ... 38.88 3.39 2.50 12.06 16.38 12.15 54.07 5.41 5.96 10.28
1993-09-09 20.44 2.38 10.51 216.38 13.16 19.94 3.22 5.77 39.12 5.14 ... 40.50 3.67 2.38 11.94 16.25 11.95 54.70 5.39 6.17 10.50
1993-09-20 19.75 2.14 10.74 206.25 12.44 20.00 3.22 5.88 39.25 5.22 ... 40.38 4.35 2.44 12.25 16.41 12.26 56.82 5.25 5.75 10.62
1993-09-29 19.06 2.33 10.06 200.63 12.66 19.44 3.22 5.75 38.38 5.22 ... 40.88 3.78 2.56 13.38 16.38 12.01 56.07 5.59 6.59 10.50
1993-10-08 19.56 2.41 10.01 203.25 13.00 18.69 3.22 5.52 38.00 5.32 ... 38.75 3.37 3.31 13.38 16.19 12.26 62.31 5.83 6.54 10.94
1993-10-19 20.19 2.69 9.95 206.25 12.12 19.19 3.53 5.72 39.50 5.30 ... 40.88 3.33 3.94 12.94 16.41 11.76 59.44 5.88 6.25 10.56
1993-10-28 21.00 2.64 10.29 202.88 11.59 18.81 4.00 5.44 38.50 5.26 ... 41.50 3.21 3.88 13.62 16.06 13.28 56.94 5.80 6.54 9.94
1993-11-08 21.00 2.53 9.90 204.38 11.62 18.38 3.91 5.02 35.25 5.06 ... 42.75 3.16 4.69 15.00 15.69 13.63 54.94 5.72 6.34 9.69
1993-11-17 21.75 2.59 10.06 206.25 11.78 20.38 4.12 5.22 36.25 5.26 ... 42.75 3.50 4.75 14.94 15.44 13.76 53.20 5.69 6.59 9.19
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2014-02-27 119.34 69.92 39.89 36.09 16.49 128.56 43.15 54.28 49.80 13.78 ... 29.79 53.03 84.25 74.58 94.99 10.93 33.91 109.84 46.86 31.57
2014-03-10 121.10 68.04 42.45 35.82 17.47 126.89 44.46 52.59 49.21 13.84 ... 29.61 53.69 86.32 74.77 94.71 10.85 33.06 110.34 45.88 31.43
2014-03-19 121.04 67.63 42.66 35.52 17.44 122.24 46.19 51.05 48.94 13.83 ... 28.84 53.66 89.70 76.14 96.24 10.91 31.67 106.20 45.21 29.83
2014-03-28 117.30 64.88 43.21 37.44 16.98 124.46 45.36 48.34 50.00 14.29 ... 29.01 52.93 89.60 77.31 96.78 11.43 32.00 105.73 46.05 30.75
2014-04-08 117.12 61.82 43.53 37.59 16.44 124.15 45.40 47.15 51.41 14.38 ... 27.90 51.43 90.28 77.22 99.94 11.50 32.28 107.03 44.70 29.88
2014-04-17 117.94 64.04 45.04 40.68 16.15 127.92 46.47 48.49 51.73 14.26 ... 29.81 46.81 87.53 79.76 101.17 12.00 31.78 104.83 44.58 28.65
2014-04-29 117.13 60.91 43.23 40.32 15.24 128.37 46.97 47.57 53.88 14.32 ... 30.42 46.18 83.35 77.96 103.11 11.83 32.12 105.11 45.71 28.76
2014-05-08 119.04 59.08 44.07 39.84 14.93 130.57 47.35 46.82 53.06 14.08 ... 30.31 45.34 85.91 77.01 100.74 11.93 34.07 105.89 46.39 28.02
2014-05-19 118.23 62.20 43.89 40.53 14.67 131.35 48.50 51.55 51.02 14.00 ... 31.11 46.65 87.20 75.53 101.06 12.20 35.16 108.13 47.24 28.47
2014-05-29 120.22 65.45 44.64 40.74 15.15 135.14 49.70 52.69 52.79 13.96 ... 31.34 46.00 92.16 77.21 101.60 12.85 36.04 110.34 48.26 29.79
2014-06-09 122.95 66.93 45.14 43.05 15.84 137.96 49.50 54.51 53.49 14.39 ... 30.93 47.47 92.32 74.99 102.42 12.76 35.51 107.80 47.69 29.77
2014-06-18 130.72 73.08 44.43 43.92 15.65 132.48 47.58 56.96 54.12 15.03 ... 32.70 46.83 91.89 74.91 102.03 12.34 35.15 107.26 47.95 29.77
2014-06-27 128.61 72.00 43.82 44.79 15.33 128.54 47.69 56.40 55.33 15.53 ... 33.35 48.57 94.36 76.65 102.83 12.20 35.10 110.40 47.49 29.39
2014-07-09 129.28 71.57 46.15 47.07 15.60 126.79 48.43 55.63 54.71 15.38 ... 32.25 47.54 98.15 76.61 102.31 12.81 32.77 109.98 46.27 28.52
2014-07-18 129.98 72.55 48.10 49.47 15.49 127.64 48.73 57.36 54.39 15.19 ... 32.27 41.33 100.81 75.71 104.37 13.10 32.46 111.02 47.19 29.42
2014-07-29 135.86 72.27 47.57 50.73 15.34 122.32 46.18 55.47 54.09 15.10 ... 31.61 41.26 101.88 74.20 98.98 12.99 32.33 108.86 46.83 27.98
2014-08-07 131.16 67.83 48.52 48.00 15.12 119.84 42.76 53.63 49.84 14.39 ... 32.96 42.13 100.62 73.90 99.03 13.39 35.86 112.37 47.34 28.12
2014-08-18 134.14 71.06 49.79 48.96 15.45 124.98 47.01 53.79 51.47 14.75 ... 34.28 41.78 101.95 75.52 99.64 13.67 35.56 114.87 48.40 29.25
2014-08-27 133.00 71.55 49.58 50.13 16.20 128.20 46.50 53.78 53.15 15.04 ... 33.95 43.06 100.92 77.51 99.26 13.53 36.15 115.51 47.55 29.07
2014-09-08 131.99 73.39 50.53 51.03 16.35 127.98 47.33 54.37 53.53 14.94 ... 33.10 43.22 98.10 76.32 97.43 13.70 34.75 116.12 46.57 29.19
2014-09-17 130.92 67.30 51.20 48.84 16.77 127.76 46.16 54.89 53.32 14.47 ... 31.96 42.78 96.19 76.12 94.25 13.26 33.74 115.64 45.38 28.92
2014-09-26 134.77 68.36 50.87 48.57 17.03 128.69 44.42 54.28 52.25 14.22 ... 32.22 41.16 94.70 77.35 94.52 12.99 34.11 113.42 46.01 28.59
2014-10-07 125.85 65.96 49.57 47.85 16.88 123.32 42.80 56.59 52.96 14.24 ... 32.09 37.58 85.10 75.20 90.22 12.71 31.77 105.34 45.72 25.96
2014-10-16 127.71 62.86 43.17 43.80 16.08 120.29 43.80 49.89 53.94 12.98 ... 33.89 43.20 91.68 76.38 94.49 12.55 33.35 112.72 46.14 27.53
2014-10-27 128.55 67.03 44.62 49.14 16.59 122.12 51.82 54.77 56.76 13.42 ... 34.46 43.77 100.00 77.26 94.52 13.16 33.33 109.35 50.77 29.38
2014-11-05 133.97 71.37 50.31 48.78 17.34 124.22 53.97 58.32 59.74 13.95 ... 33.95 42.83 97.83 82.94 94.66 13.45 34.99 112.95 51.86 29.28
2014-11-14 134.71 71.42 51.35 50.82 17.14 128.86 55.34 59.66 56.06 13.47 ... 34.83 44.67 102.76 85.40 95.72 13.68 35.75 114.50 54.48 29.33
2014-11-25 145.35 72.00 52.93 52.11 17.10 134.81 55.91 61.03 56.58 13.92 ... 35.30 46.90 103.72 84.76 94.37 14.20 37.63 109.53 55.61 27.82
2014-12-05 146.73 72.40 53.25 50.85 17.68 132.21 57.26 60.90 57.76 13.68 ... 35.59 44.16 105.40 83.94 86.90 13.34 36.50 102.03 53.41 26.82
2014-12-16 136.82 72.32 49.32 43.29 16.72 124.25 55.68 56.63 57.83 12.79 ... 36.62 44.08 113.11 86.43 93.78 14.08 38.71 107.18 53.76 28.52

788 rows × 1995 columns


In [112]:
y_train_true_df, y_train_pred_df, y_val_true_df, y_val_pred_df = ev.run_single_val(x, y, ahead_days, predictor)


Multiindex = True
2014-12-16 00:00:00
0
0

In [113]:
from sklearn.metrics import r2_score

r2_score(y_train_true_df, y_train_pred_df, multioutput='raw_values')


Out[113]:
array([ 0.99862915])

In [117]:
tickers = y_train_true_df.index.levels[1]
tickers


Out[117]:
Index(['AAPL', 'ABT', 'ADBE', 'ADM', 'ADP', 'ADSK', 'AEP', 'AES', 'AET', 'AFL',
       ...
       'WHR', 'WMB', 'WMT', 'WY', 'XEL', 'XLNX', 'XOM', 'XRAY', 'XRX', 'ZION'],
      dtype='object', length=285)

In [127]:
y_train_true_df.loc[(slice(None), 'AAPL'),:]


Out[127]:
target
1993-01-29 AAPL 22.75
1993-02-09 AAPL 21.38
1993-02-19 AAPL 21.62
1993-03-02 AAPL 22.69
1993-03-11 AAPL 22.00
1993-03-22 AAPL 22.06
1993-03-31 AAPL 21.19
1993-04-12 AAPL 21.31
1993-04-21 AAPL 21.88
1993-04-30 AAPL 21.62
1993-05-11 AAPL 22.25
1993-05-20 AAPL 21.56
1993-06-01 AAPL 21.00
1993-06-10 AAPL 20.19
1993-06-21 AAPL 19.62
1993-06-30 AAPL 20.75
1993-07-12 AAPL 20.94
1993-07-21 AAPL 20.12
1993-07-30 AAPL 21.06
1993-08-10 AAPL 21.75
1993-08-19 AAPL 22.00
1993-08-30 AAPL 20.44
1993-09-09 AAPL 19.75
1993-09-20 AAPL 19.06
1993-09-29 AAPL 19.56
1993-10-08 AAPL 20.19
1993-10-19 AAPL 21.00
1993-10-28 AAPL 21.00
1993-11-08 AAPL 21.75
1993-11-17 AAPL 21.94
... ... ...
2014-02-18 AAPL 119.34
2014-02-27 AAPL 121.10
2014-03-10 AAPL 121.04
2014-03-19 AAPL 117.30
2014-03-28 AAPL 117.12
2014-04-08 AAPL 117.94
2014-04-17 AAPL 117.13
2014-04-29 AAPL 119.04
2014-05-08 AAPL 118.23
2014-05-19 AAPL 120.22
2014-05-29 AAPL 122.95
2014-06-09 AAPL 130.72
2014-06-18 AAPL 128.61
2014-06-27 AAPL 129.28
2014-07-09 AAPL 129.98
2014-07-18 AAPL 135.86
2014-07-29 AAPL 131.16
2014-08-07 AAPL 134.14
2014-08-18 AAPL 133.00
2014-08-27 AAPL 131.99
2014-09-08 AAPL 130.92
2014-09-17 AAPL 134.77
2014-09-26 AAPL 125.85
2014-10-07 AAPL 127.71
2014-10-16 AAPL 128.55
2014-10-27 AAPL 133.97
2014-11-05 AAPL 134.71
2014-11-14 AAPL 145.35
2014-11-25 AAPL 146.73
2014-12-05 AAPL 136.82

787 rows × 1 columns


In [128]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

r2_train_score = []
mae_train = []

for ticker in tickers:
    y_true = y_train_true_df.loc[(slice(None), 'AAPL'),:]
    y_pred = y_train_pred_df.loc[(slice(None), 'AAPL'),:]
    r2_train_score.append(r2_score(y_true, y_pred))
    mae_train.append(mean_absolute_error(y_true, y_pred))

In [132]:
np.mean(r2_train_score)


Out[132]:
0.99653405959847863

In [133]:
np.mean(mae_train)


Out[133]:
1.1550154292975134

In [ ]:


In [72]:
train_days = 252
step_eval_days = 252
r2_train_means, r2_train_stds, y_val_true_df, y_val_pred_df = ev.roll_evaluate(x, 
                                                                               y, 
                                                                               train_days, 
                                                                               step_eval_days, 
                                                                               ahead_days, 
                                                                               predictor, 
                                                                               verbose=True)


Evaluating approximately 20 training/evaluation pairs
Multiindex = True
1994-01-27 00:00:00
0
0
24
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-72-0a55fab3ef2d> in <module>()
      7                                                                                ahead_days,
      8                                                                                predictor,
----> 9                                                                                verbose=True)

/home/miguel/udacity/Machine Learning Nanodegree/projects/capstone/capstone/predictor/evaluation.py in roll_evaluate(x, y, train_days, step_eval_days, ahead_days, predictor, verbose)
     97         print(y_train_true.isnull().sum().sum())
     98         # Calculate R^2 for training and append
---> 99         scores = r2_score(y_train_true, y_train_pred, multioutput='raw_values')
    100         r2_train_means.append(np.mean(scores))
    101         r2_train_stds.append(np.std(scores))

/home/miguel/anaconda3/envs/cap_env/lib/python3.6/site-packages/sklearn/metrics/regression.py in r2_score(y_true, y_pred, sample_weight, multioutput)
    453     """
    454     y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 455         y_true, y_pred, multioutput)
    456 
    457     if sample_weight is not None:

/home/miguel/anaconda3/envs/cap_env/lib/python3.6/site-packages/sklearn/metrics/regression.py in _check_reg_targets(y_true, y_pred, multioutput)
     73     """
     74     check_consistent_length(y_true, y_pred)
---> 75     y_true = check_array(y_true, ensure_2d=False)
     76     y_pred = check_array(y_pred, ensure_2d=False)
     77 

/home/miguel/anaconda3/envs/cap_env/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              % (array.ndim, estimator_name))
    406         if force_all_finite:
--> 407             _assert_all_finite(array)
    408 
    409     shape_repr = _shape_repr(array.shape)

/home/miguel/anaconda3/envs/cap_env/lib/python3.6/site-packages/sklearn/utils/validation.py in _assert_all_finite(X)
     56             and not np.isfinite(X).all()):
     57         raise ValueError("Input contains NaN, infinity"
---> 58                          " or a value too large for %r." % X.dtype)
     59 
     60 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [ ]:
print(len(r2_train_means))
print(len(r2_train_stds))
print(y_val_true_df.shape)
print(y_val_pred_df)

In [27]:
plt.plot(r2_train_means)


Out[27]:
[<matplotlib.lines.Line2D at 0x7f3fa3fbd630>]

In [28]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_val_true_df, y_val_pred_df)
mae


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-28-f03d5a0b0871> in <module>()
      1 from sklearn.metrics import mean_absolute_error
      2 
----> 3 mae = mean_absolute_error(y_val_true_df, y_val_pred_df)
      4 mae

/home/miguel/anaconda3/envs/cap_env/lib/python3.6/site-packages/sklearn/metrics/regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
    161     """
    162     y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 163         y_true, y_pred, multioutput)
    164     output_errors = np.average(np.abs(y_pred - y_true),
    165                                weights=sample_weight, axis=0)

/home/miguel/anaconda3/envs/cap_env/lib/python3.6/site-packages/sklearn/metrics/regression.py in _check_reg_targets(y_true, y_pred, multioutput)
     73     """
     74     check_consistent_length(y_true, y_pred)
---> 75     y_true = check_array(y_true, ensure_2d=False)
     76     y_pred = check_array(y_pred, ensure_2d=False)
     77 

/home/miguel/anaconda3/envs/cap_env/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    414                              " minimum of %d is required%s."
    415                              % (n_samples, shape_repr, ensure_min_samples,
--> 416                                 context))
    417 
    418     if ensure_min_features > 0 and array.ndim == 2:

ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

In [ ]: