In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys

%matplotlib inline

%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

%load_ext autoreload
%autoreload 2

sys.path.append('../')


Populating the interactive namespace from numpy and matplotlib

In [2]:
data_df = pd.read_pickle('../data/data_train_val_df.pkl')
print(data_df.shape)
data_df.head()


(5520, 2415)
Out[2]:
feature Close ... Volume
SPY MMM ABT ABBV ACN ATVI AYI ADBE AMD AAP ... WYNN XEL XRX XLNX XYL YHOO YUM ZBH ZION ZTS
date
1993-01-29 43.94 24.50 6.88 NaN NaN NaN NaN 2.59 18.75 NaN ... NaN 87800.0 7633602.0 1745196.0 NaN NaN NaN NaN 33600.0 NaN
1993-02-01 44.25 24.69 6.88 NaN NaN NaN NaN 2.72 19.12 NaN ... NaN 72400.0 3001200.0 3574800.0 NaN NaN NaN NaN 32000.0 NaN
1993-02-02 44.34 24.72 6.53 NaN NaN NaN NaN 2.84 20.25 NaN ... NaN 242200.0 1388598.0 2652396.0 NaN NaN NaN NaN 251600.0 NaN
1993-02-03 44.81 25.19 6.91 NaN NaN NaN NaN 2.70 20.50 NaN ... NaN 272200.0 1228200.0 5040396.0 NaN NaN NaN NaN 254800.0 NaN
1993-02-04 45.00 26.06 6.84 NaN NaN NaN NaN 2.73 20.12 NaN ... NaN 162800.0 1675602.0 7033200.0 NaN NaN NaN NaN 317200.0 NaN

5 rows × 2415 columns

The first objective of this notebook is to implement the next function (to extract sample intervals from the total period).


In [3]:
def generate_train_intervals(data_df, train_time, base_time, step, days_ahead, today):
    pass

Let's define the parameters as constants, just to do some scratch work.


In [107]:
# I will try to keep the convention to name with the "days" suffix, 
# to all the variables that represent "market days". The ones that 
# represent real time will be named more arbitrarily.

train_time = 365 # In real time days
base_days = 7 # In market days
step_days = 7 # market days
ahead_days = 1 # market days
today = data_df.index[-1] # Real date

In [108]:
today


Out[108]:
Timestamp('2014-12-31 00:00:00')

The amount of samples to be generated would be (train_time - base_time) * num_companies / step. There are days_ahead market days left, only for target values, so the total "used" period is train_time + days_ahead.

The option of training with all, one, or some companies can be done by the user when it inputs the data (just filter data_df to get the companies you want). Anyway, one interesting choice would be to allow the training with multiple companies, targeting only one. That would multiply the features by the number of available companies, but would reduce the samples a lot. By now, I want to keep the complexity low, so I won't implement that idea, yet. A many to many approach could also be implemented (the target would be the vector with all the companies data). I will start with the simple "one to one".


In [109]:
data_df.index[data_df.index <= today][-(ahead_days + 1)]


Out[109]:
Timestamp('2014-12-30 00:00:00')

In [134]:
def add_market_days(base, delta, data_df):
    """
    base is in real time.
    delta is in market days.
    """
    market_days = data_df.index
    if base not in market_days:
        raise Exception('The base date is not in the market days list.')
    base_index = market_days.tolist().index(base)
    if base_index + delta >= len(market_days):
        return market_days[-1]
    if base_index + delta < 0:
        return market_days[0]
    return market_days[base_index + delta]

In [135]:
# Remember the last target days are not used for training, but that is a "market days" period.
end_of_training_date = add_market_days(today, -ahead_days, data_df)
start_date = end_of_training_date - dt.timedelta(train_time) 
print('Start date: %s.  End of training date: %s.' % (start_date, end_of_training_date))


Start date: 2013-12-30 00:00:00.  End of training date: 2014-12-30 00:00:00.

In [136]:
TARGET_FEATURE = 'Close'

One important thing to note: the base time is in "market days", that means that it doesn't represent a period of "real" time (the real time may vary with each base interval).


In [137]:
def print_period(data_df):
    print('Period: %s  to  %s.' % (data_df.index[0], data_df.index[-1]))

In [138]:
data_train_df = data_df[start_date:end_of_training_date]

print_period(data_train_df)
data_train_df.shape


Period: 2013-12-30 00:00:00  to  2014-12-30 00:00:00.
Out[138]:
(253, 2415)

In [139]:
start_target_date = add_market_days(start_date, base_days + ahead_days - 1, data_df)
data_target_df = data_df.loc[start_target_date: today,TARGET_FEATURE]

print_period(data_target_df)
data_target_df.shape


Period: 2014-01-09 00:00:00  to  2014-12-31 00:00:00.
Out[139]:
(247, 483)

Is that initial date correct?


In [140]:
data_train_df.index[:10]


Out[140]:
DatetimeIndex(['2013-12-30', '2013-12-31', '2014-01-02', '2014-01-03',
               '2014-01-06', '2014-01-07', '2014-01-08', '2014-01-09',
               '2014-01-10', '2014-01-13'],
              dtype='datetime64[ns]', name='date', freq=None)

Ok, it looks so.

Let's split now!

I should allow for different feature extraction functions to be used, after the time divisions.


In [141]:
date_base_ini = start_date
date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
date_target = add_market_days(date_base_end, ahead_days, data_df)

sample_blob = (data_train_df[date_base_ini: date_base_end], pd.DataFrame(data_target_df.loc[date_target]))
sample_blob[0]


Out[141]:
feature Close ... Volume
SPY MMM ABT ABBV ACN ATVI AYI ADBE AMD AAP ... WYNN XEL XRX XLNX XYL YHOO YUM ZBH ZION ZTS
date
2013-12-30 183.82 139.42 38.41 53.01 82.06 17.80 109.71 59.55 3.85 109.63 ... 1950055.0 1867094.0 3846744.0 1049165.0 561674.0 8678033.0 1953272.0 623173.0 895272.0 1803579.0
2013-12-31 184.69 140.25 38.33 52.81 82.22 17.83 109.32 59.88 3.87 110.68 ... 731522.0 1752814.0 5359541.0 1215365.0 557969.0 8292761.0 2132916.0 649985.0 1077417.0 2270418.0
2014-01-02 182.92 138.13 38.23 51.98 81.13 18.07 107.65 59.29 3.95 109.74 ... 3056065.0 3192314.0 10481405.0 3437019.0 765141.0 21514650.0 1956285.0 868763.0 1356738.0 2576112.0
2014-01-03 182.88 138.45 38.64 52.30 81.40 18.29 108.15 59.16 4.00 112.88 ... 1169540.0 2939378.0 7282652.0 1982702.0 454495.0 15761243.0 1457058.0 1288207.0 1122452.0 2524947.0
2014-01-06 182.36 137.63 39.15 50.39 80.54 18.08 106.28 58.12 4.13 111.80 ... 1289126.0 3382267.0 14906758.0 1970805.0 849360.0 12472724.0 2940835.0 1414955.0 1988180.0 2763350.0
2014-01-07 183.48 137.65 38.85 50.49 81.52 18.32 109.44 58.97 4.18 113.18 ... 1688085.0 3481465.0 15383264.0 1581167.0 611127.0 14141112.0 3625927.0 1852572.0 1343169.0 2338176.0
2014-01-08 183.52 136.63 39.20 50.36 82.15 18.34 110.03 58.90 4.18 112.30 ... 1406668.0 3563670.0 7833434.0 2318930.0 1234973.0 18657195.0 4448753.0 1880549.0 2034692.0 3965882.0

7 rows × 2415 columns


In [163]:
target = sample_blob[1].T
target


Out[163]:
SPY MMM ABT ABBV ACN ATVI AYI ADBE AMD AAP ... WYNN XEL XRX XLNX XYL YHOO YUM ZBH ZION ZTS
2014-01-09 183.64 136.45 39.27 51.22 82.95 18.3 123.5 59.09 4.09 113.55 ... 204.77 27.82 12.05 45.78 34.63 40.92 75.05 96.47 30.22 31.96

1 rows × 483 columns

Let's define a function that takes a "sample blob" and produces one sample per symbol, only for the "Close" feature (looks like the easiest to do first). The dates in the base period should be substituted by an index, and the symbols shuffled later (along with their labels).


In [164]:
feat_close = sample_blob[0][TARGET_FEATURE]
feat_close.index = np.arange(base_days)
feat_close


Out[164]:
SPY MMM ABT ABBV ACN ATVI AYI ADBE AMD AAP ... WYNN XEL XRX XLNX XYL YHOO YUM ZBH ZION ZTS
0 206.78 165.30 45.77 67.92 89.74 20.04 137.17 74.89 2.55 160.74 ... 145.10 35.35 13.89 43.71 38.10 50.91 71.74 114.94 28.20 43.15
1 206.52 165.48 46.05 67.71 90.51 20.25 137.38 74.45 2.57 159.85 ... 150.11 35.40 13.96 43.00 38.59 50.88 71.17 114.91 28.39 43.51
2 207.47 167.27 46.37 66.97 91.18 20.30 140.05 74.50 2.66 159.44 ... 147.48 35.65 13.97 43.87 38.74 51.15 72.46 115.05 28.18 43.41
3 207.75 166.87 45.72 64.35 91.42 20.23 139.48 74.38 2.67 161.19 ... 151.69 35.43 14.07 43.91 38.72 50.02 72.68 113.47 28.53 42.97
4 207.77 166.96 45.70 66.21 91.32 20.35 139.94 74.74 2.65 160.23 ... 150.90 36.23 14.08 44.08 38.76 50.65 72.79 114.11 28.52 43.84
5 208.44 166.26 45.85 66.98 91.26 20.42 139.66 74.67 2.65 160.00 ... 150.37 36.58 14.14 43.92 38.95 50.86 73.14 114.17 28.56 44.20
6 208.72 166.71 45.60 67.14 90.52 20.36 139.88 74.13 2.66 161.22 ... 153.00 37.25 14.14 43.79 38.86 50.53 73.56 113.50 28.72 44.01

7 rows × 483 columns


In [165]:
target.index = ['target']
target


Out[165]:
SPY MMM ABT ABBV ACN ATVI AYI ADBE AMD AAP ... WYNN XEL XRX XLNX XYL YHOO YUM ZBH ZION ZTS
target 183.64 136.45 39.27 51.22 82.95 18.3 123.5 59.09 4.09 113.55 ... 204.77 27.82 12.05 45.78 34.63 40.92 75.05 96.47 30.22 31.96

1 rows × 483 columns


In [166]:
x_y_samples = feat_close.append(target)
x_y_samples


Out[166]:
SPY MMM ABT ABBV ACN ATVI AYI ADBE AMD AAP ... WYNN XEL XRX XLNX XYL YHOO YUM ZBH ZION ZTS
0 206.78 165.30 45.77 67.92 89.74 20.04 137.17 74.89 2.55 160.74 ... 145.10 35.35 13.89 43.71 38.10 50.91 71.74 114.94 28.20 43.15
1 206.52 165.48 46.05 67.71 90.51 20.25 137.38 74.45 2.57 159.85 ... 150.11 35.40 13.96 43.00 38.59 50.88 71.17 114.91 28.39 43.51
2 207.47 167.27 46.37 66.97 91.18 20.30 140.05 74.50 2.66 159.44 ... 147.48 35.65 13.97 43.87 38.74 51.15 72.46 115.05 28.18 43.41
3 207.75 166.87 45.72 64.35 91.42 20.23 139.48 74.38 2.67 161.19 ... 151.69 35.43 14.07 43.91 38.72 50.02 72.68 113.47 28.53 42.97
4 207.77 166.96 45.70 66.21 91.32 20.35 139.94 74.74 2.65 160.23 ... 150.90 36.23 14.08 44.08 38.76 50.65 72.79 114.11 28.52 43.84
5 208.44 166.26 45.85 66.98 91.26 20.42 139.66 74.67 2.65 160.00 ... 150.37 36.58 14.14 43.92 38.95 50.86 73.14 114.17 28.56 44.20
6 208.72 166.71 45.60 67.14 90.52 20.36 139.88 74.13 2.66 161.22 ... 153.00 37.25 14.14 43.79 38.86 50.53 73.56 113.50 28.72 44.01
target 183.64 136.45 39.27 51.22 82.95 18.30 123.50 59.09 4.09 113.55 ... 204.77 27.82 12.05 45.78 34.63 40.92 75.05 96.47 30.22 31.96

8 rows × 483 columns


In [146]:
x_y_samples_shuffled = x_y_samples.T.sample(frac=1).reset_index(drop=True)
x_y_samples_shuffled.head()


Out[146]:
0 1 2 3 4 5 6 target
0 46.36 46.61 46.74 46.42 45.80 46.09 46.71 48.00
1 35.20 35.16 34.95 34.80 34.96 34.95 34.24 33.54
2 94.24 94.83 94.47 94.70 93.76 94.14 94.74 95.06
3 64.39 64.69 63.58 63.51 63.52 64.69 64.76 66.32
4 37.16 37.32 36.77 36.93 37.43 38.03 38.14 38.40

It is important to take care of the NaN values. Possibly at this sample_blob level is a good point to do so; just discard too bad samples.


In [147]:
x_y_samples_shuffled.isnull().sum()


Out[147]:
0         10
1         10
2         10
3         10
4         10
5         10
6         10
target    10
dtype: int64

In [148]:
x_y_samples_filtered = x_y_samples_shuffled.dropna(axis=0, how='any')
print(x_y_samples_filtered.shape)
x_y_samples_filtered.isnull().sum()


(473, 8)
Out[148]:
0         0
1         0
2         0
3         0
4         0
5         0
6         0
target    0
dtype: int64

In [149]:
# At some point I will have to standarize those values... (not now, but just as a reminder...)

std_samples = x_y_samples_shuffled.apply(lambda x: x / np.mean(x), axis=1)
std_samples.head()


Out[149]:
0 1 2 3 4 5 6 target
0 0.995037 1.000402 1.003193 0.996324 0.983017 0.989242 1.002549 1.030236
1 1.013679 1.012527 1.006479 1.002160 1.006767 1.006479 0.986033 0.965875
2 0.997328 1.003572 0.999762 1.002196 0.992248 0.996270 1.002619 1.006006
3 0.999340 1.003996 0.986769 0.985683 0.985838 1.003996 1.005083 1.029294
4 0.990339 0.994603 0.979945 0.984209 0.997535 1.013525 1.016457 1.023386

In [155]:
features = std_samples.iloc[:,:-1]
features.head()


Out[155]:
0 1 2 3 4 5 6
0 0.995037 1.000402 1.003193 0.996324 0.983017 0.989242 1.002549
1 1.013679 1.012527 1.006479 1.002160 1.006767 1.006479 0.986033
2 0.997328 1.003572 0.999762 1.002196 0.992248 0.996270 1.002619
3 0.999340 1.003996 0.986769 0.985683 0.985838 1.003996 1.005083
4 0.990339 0.994603 0.979945 0.984209 0.997535 1.013525 1.016457

In [159]:
target = pd.DataFrame(std_samples.iloc[:,-1])
target.head()


Out[159]:
target
0 1.030236
1 0.965875
2 1.006006
3 1.029294
4 1.023386

Let's create the samples divider function


In [186]:
TARGET_FEATURE = 'Close'


def feature_close_one_to_one(sample_blob):
    target = sample_blob[1].T
    feat_close = sample_blob[0][TARGET_FEATURE]
    feat_close.index = np.arange(base_days)
    target.index = ['target']
    x_y_samples = feat_close.append(target)
    x_y_samples_shuffled = x_y_samples.T.sample(frac=1).reset_index(drop=True)
    x_y_samples_filtered = x_y_samples_shuffled.dropna(axis=0, how='any')
    
    return x_y_samples_filtered

In [188]:
feature_close_one_to_one(sample_blob).head()


Out[188]:
0 1 2 3 4 5 6 target
0 63.59 63.07 63.55 64.04 63.62 63.51 63.56 53.02
1 106.44 105.80 107.14 107.13 107.69 108.25 108.40 92.57
2 59.75 59.63 59.78 59.90 61.32 62.27 63.18 46.02
3 90.92 91.57 92.59 93.52 93.37 94.05 93.78 72.16
4 15.02 15.22 15.38 15.61 15.48 15.57 15.74 17.34

In [189]:
date_base_ini = start_date
date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
feat_tgt_df = pd.DataFrame()

while date_base_end < end_of_training_date:
    sample_blob = (data_train_df[date_base_ini: date_base_end],
                   pd.DataFrame(data_target_df.loc[date_target]))
    feat_tgt_blob = feature_close_one_to_one(sample_blob) # TODO: Change for a generic function
    feat_tgt_df = feat_tgt_df.append(feat_tgt_blob, ignore_index=True)
    
    date_base_ini = add_market_days(date_base_ini, step_days, data_df)
    date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
    # print('Start: %s,  End:%s' % (date_base_ini, date_base_end))

feat_tgt_df = feat_tgt_df.sample(frac=1).reset_index(drop=True)

X_df = feat_tgt_df.iloc[:,:-1]
y_df = pd.DataFrame(feat_tgt_df.iloc[:,-1])

In [190]:
print(X_df.shape)
X_df.head()


(17028, 7)
Out[190]:
0 1 2 3 4 5 6
0 51.59 50.58 50.25 51.05 51.09 49.61 49.62
1 33.71 34.51 34.06 35.05 35.34 35.02 36.48
2 99.15 99.25 99.98 99.68 100.22 100.41 100.58
3 127.25 126.86 126.40 121.64 118.11 119.16 121.58
4 41.63 41.54 40.92 41.06 40.01 39.99 40.86

In [191]:
print(y_df.shape)
y_df.head()


(17028, 1)
Out[191]:
target
0 49.76
1 34.63
2 92.01
3 113.55
4 39.27

So, I have everything to define the final function of this notebook


In [192]:
def generate_train_intervals(data_df, train_time, base_time, step, days_ahead, today, blob_fun):
    end_of_training_date = add_market_days(today, -ahead_days, data_df)
    start_date = end_of_training_date - dt.timedelta(train_time)
    start_target_date = add_market_days(start_date, base_days + ahead_days - 1, data_df)
    
    data_train_df = data_df[start_date:end_of_training_date]
    data_target_df = data_df.loc[start_target_date: today,TARGET_FEATURE]
    
    date_base_ini = start_date
    date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
    feat_tgt_df = pd.DataFrame()

    while date_base_end < end_of_training_date:
        sample_blob = (data_train_df[date_base_ini: date_base_end],
                       pd.DataFrame(data_target_df.loc[date_target]))
        feat_tgt_blob = blob_fun(sample_blob)
        feat_tgt_df = feat_tgt_df.append(feat_tgt_blob, ignore_index=True)
        
        date_base_ini = add_market_days(date_base_ini, step_days, data_df)
        date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
        # print('Start: %s,  End:%s' % (date_base_ini, date_base_end))
    
    feat_tgt_df = feat_tgt_df.sample(frac=1).reset_index(drop=True)
    
    X_df = feat_tgt_df.iloc[:,:-1]
    y_df = pd.DataFrame(feat_tgt_df.iloc[:,-1])
    
    return X_df, y_df

In [194]:
train_time = 365 # In real time days
base_days = 7 # In market days
step_days = 7 # market days
ahead_days = 1 # market days
today = data_df.index[-1] # Real date

X, y = generate_train_intervals(data_df, train_time, base_days, step_days, ahead_days, today, feature_close_one_to_one)

In [195]:
print(X.shape)
X.head()


(17028, 7)
Out[195]:
0 1 2 3 4 5 6
0 69.52 69.79 69.66 69.45 68.60 68.90 68.76
1 69.14 68.32 69.01 67.82 68.51 67.25 66.84
2 66.25 66.09 66.58 66.38 66.09 66.40 66.08
3 49.64 49.58 49.56 49.09 49.39 49.33 49.08
4 566.88 566.98 569.74 543.14 538.15 554.90 564.14

In [196]:
print(y.shape)
y.head()


(17028, 1)
Out[196]:
target
0 65.02
1 51.82
2 60.28
3 46.16
4 564.55

In [ ]: