In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)
%load_ext autoreload
%autoreload 2
sys.path.append('../')
In [2]:
# Load the pickled train/validation dataset, then show its shape and first rows.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
data_df = pd.read_pickle('../data/data_train_val_df.pkl')
print(data_df.shape)
data_df.head()
Out[2]:
In [3]:
def generate_train_intervals(data_df, train_time, base_time, step, days_ahead, today):
    """
    Placeholder for the training-interval generator.

    The body has not been written at this point of the notebook, so calling
    this version fails loudly instead of silently returning None.

    Raises NotImplementedError on every call.
    """
    raise NotImplementedError('generate_train_intervals is not implemented yet.')
Let's define the parameters as constants, just to do some scratch work.
In [107]:
# Naming convention: variables measured in "market days" carry the "days"
# suffix. Variables measured in real (calendar) time are named more freely.
train_time = 365 # Length of the training period, in real time days
base_days = 7 # Length of each base window, in market days
step_days = 7 # Shift between the starts of consecutive base windows, in market days
ahead_days = 1 # Distance from the end of a base window to its target, in market days
today = data_df.index[-1] # Real date: the last entry of the data index
In [108]:
# Display the reference date.
today
Out[108]:
In [109]:
# Scratch check: among the index entries up to `today`, pick the one located
# `ahead_days` positions before the last.
data_df.index[data_df.index <= today][-(ahead_days + 1)]
Out[109]:
In [134]:
def add_market_days(base, delta, data_df):
    """
    Shift a date by a number of market days.

    base is in real time, but it must coincide with one of the entries of
    data_df.index (the list of market days); any other date is rejected.
    delta is in market days and may be negative to move backwards.

    Returns the entry of data_df.index located delta positions away from
    base. When the shift would fall outside the index, the result is clamped
    to its first or last entry.

    Raises ValueError (a subclass of Exception, so existing handlers keep
    working) if base is not in the market days list.
    """
    market_days = data_df.index
    # Vectorized lookup: avoids rebuilding a Python list of the whole index on
    # every call. Taking the first hit mirrors list.index() semantics.
    matches = np.flatnonzero(market_days == base)
    if matches.size == 0:
        raise ValueError('The base date is not in the market days list.')
    # Clamp the shifted position to the valid range [0, len - 1].
    shifted = min(max(int(matches[0]) + delta, 0), len(market_days) - 1)
    return market_days[shifted]
In [135]:
# Remember the last target days are not used for training, but that is a "market days" period.
end_of_training_date = add_market_days(today, -ahead_days, data_df)
start_date = end_of_training_date - dt.timedelta(train_time)
print('Start date: %s. End of training date: %s.' % (start_date, end_of_training_date))
In [136]:
# Name of the feature selected from data_df as the prediction target.
TARGET_FEATURE = 'Close'
In [137]:
def print_period(data_df):
    """Print the first and last index labels of data_df as a period."""
    first_label = data_df.index[0]
    last_label = data_df.index[-1]
    print('Period: %s to %s.' % (first_label, last_label))
In [138]:
# Training data: the rows from start_date to end_of_training_date
# (label-based slicing includes both ends).
data_train_df = data_df[start_date:end_of_training_date]
print_period(data_train_df)
data_train_df.shape
Out[138]:
In [139]:
# The first target lies `ahead_days` market days after the last day of the
# first base window, i.e. base_days + ahead_days - 1 market days after start_date.
# add_market_days raises if start_date is not one of the entries of data_df.index.
start_target_date = add_market_days(start_date, base_days + ahead_days - 1, data_df)
# Targets: the TARGET_FEATURE values from the first target date up to today.
data_target_df = data_df.loc[start_target_date: today,TARGET_FEATURE]
print_period(data_target_df)
data_target_df.shape
Out[139]:
Is that initial date correct?
In [140]:
# First ten index entries of the training slice, to eyeball the start date.
data_train_df.index[:10]
Out[140]:
OK, it looks correct.
I should allow different feature extraction functions to be plugged in once the data has been divided into time windows.
In [141]:
# Build one sample by hand: the first base window and its target.
date_base_ini = start_date
# Last day of the base window (base_days market days long, both ends included).
date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
# The target date is ahead_days market days after the end of the base window.
date_target = add_market_days(date_base_end, ahead_days, data_df)
# sample_blob = (rows of the base window, target values wrapped in a DataFrame)
sample_blob = (data_train_df[date_base_ini: date_base_end], pd.DataFrame(data_target_df.loc[date_target]))
sample_blob[0]
Out[141]:
In [163]:
# Transpose the target frame of the sample blob.
target = sample_blob[1].T
target
Out[163]:
In [164]:
# Select the target feature from the base window and relabel its rows
# 0 .. base_days - 1, replacing the existing index.
feat_close = sample_blob[0][TARGET_FEATURE]
feat_close.index = np.arange(base_days)
feat_close
Out[164]:
In [165]:
# Label the single row of the target frame 'target'.
target.index = ['target']
target
Out[165]:
In [166]:
# Stack the target row under the base-window rows. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; it stacks
# the same rows in the same order.
x_y_samples = pd.concat([feat_close, target])
x_y_samples
Out[166]:
In [146]:
# Transpose, shuffle the rows and discard the original row labels.
# sample(frac=1) is called without a random_state, so the order is not
# reproducible between runs.
x_y_samples_shuffled = x_y_samples.T.sample(frac=1).reset_index(drop=True)
x_y_samples_shuffled.head()
Out[146]:
In [147]:
# Count the missing values in each column.
x_y_samples_shuffled.isnull().sum()
Out[147]:
In [148]:
# Keep only the rows that have no missing value, then confirm none remain.
x_y_samples_filtered = x_y_samples_shuffled.dropna(axis=0, how='any')
print(x_y_samples_filtered.shape)
x_y_samples_filtered.isnull().sum()
Out[148]:
In [149]:
# At some point I will have to standardize those values... (not now, but just as a reminder...)
# For now, every row is divided by its own mean (the mean includes the target column).
# NOTE(review): this starts from x_y_samples_shuffled, not x_y_samples_filtered,
# so rows with missing values are still present - confirm this is intended.
std_samples = x_y_samples_shuffled.apply(lambda x: x / np.mean(x), axis=1)
std_samples.head()
Out[149]:
In [155]:
# Features: every column except the last one.
features = std_samples.iloc[:,:-1]
features.head()
Out[155]:
In [159]:
# Target: the last column, kept as a one-column DataFrame.
# NOTE: this rebinds `target`, which earlier held the transposed target of sample_blob.
target = pd.DataFrame(std_samples.iloc[:,-1])
target.head()
Out[159]:
In [186]:
# Name of the feature used both as input and as prediction target.
TARGET_FEATURE = 'Close'


def feature_close_one_to_one(sample_blob):
    """
    Turn one (base window, target) blob into shuffled feature/target samples.

    sample_blob[0] is the base-window frame; its TARGET_FEATURE selection is
    used as features (assumed to yield a DataFrame, e.g. one column per
    symbol - TODO confirm against the data schema).
    sample_blob[1] is a one-column frame with the target values; transposed,
    it must have exactly one row.

    Returns a DataFrame in which each row is one sample: the columns
    0 .. n-1 hold the n base-window values and the last column, 'target',
    holds the target. Rows are shuffled (no fixed seed), relabelled
    0 .. m-1, and rows containing any missing value are dropped.

    The window length is taken from the data itself rather than from a
    global, and neither input frame is modified.
    """
    target_row = sample_blob[1].T
    target_row.index = ['target']
    # reset_index returns a new frame numbered 0 .. n-1, so the caller's
    # window keeps its date index.
    feat_close = sample_blob[0][TARGET_FEATURE].reset_index(drop=True)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    x_y_samples = pd.concat([feat_close, target_row])
    x_y_samples_shuffled = x_y_samples.T.sample(frac=1).reset_index(drop=True)
    return x_y_samples_shuffled.dropna(axis=0, how='any')
In [188]:
# Try the feature extraction function on the hand-built sample blob.
feature_close_one_to_one(sample_blob).head()
Out[188]:
In [189]:
# Slide a base window of `base_days` market days across the training period,
# advancing `step_days` market days at a time, and collect one feature/target
# blob per window.
date_base_ini = start_date
date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
feat_tgt_blobs = []
while date_base_end < end_of_training_date:
    # The target date has to be recomputed for every window; otherwise all the
    # samples would be paired with the target of the first window.
    date_target = add_market_days(date_base_end, ahead_days, data_df)
    sample_blob = (data_train_df[date_base_ini: date_base_end],
                   pd.DataFrame(data_target_df.loc[date_target]))
    feat_tgt_blob = feature_close_one_to_one(sample_blob) # TODO: Change for a generic function
    feat_tgt_blobs.append(feat_tgt_blob)
    date_base_ini = add_market_days(date_base_ini, step_days, data_df)
    date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)

# Concatenate once at the end: growing a DataFrame inside the loop is quadratic,
# and DataFrame.append was removed in pandas 2.0.
if feat_tgt_blobs:
    feat_tgt_df = pd.concat(feat_tgt_blobs, ignore_index=True)
else:
    feat_tgt_df = pd.DataFrame()
# Shuffle the samples, then split features (all but the last column) from the
# target (the last column).
feat_tgt_df = feat_tgt_df.sample(frac=1).reset_index(drop=True)
X_df = feat_tgt_df.iloc[:,:-1]
y_df = pd.DataFrame(feat_tgt_df.iloc[:,-1])
In [190]:
# Shape and first rows of the feature matrix.
print(X_df.shape)
X_df.head()
Out[190]:
In [191]:
# Shape and first rows of the target frame.
print(y_df.shape)
y_df.head()
Out[191]:
In [192]:
def generate_train_intervals(data_df, train_time, base_time, step, days_ahead, today, blob_fun):
    """
    Build shuffled feature/target samples by sliding a base window over the
    training period.

    data_df: frame whose index is the list of market days; its TARGET_FEATURE
        selection supplies the targets.
    train_time: length of the training period, in real time (calendar) days.
    base_time: length of each base window, in market days.
    step: shift between the starts of consecutive windows, in market days.
    days_ahead: distance from the end of a base window to its target, in
        market days.
    today: reference date; must be one of the entries of data_df.index.
    blob_fun: callable that receives a (base window frame, target frame)
        tuple and returns a DataFrame of samples whose last column is the
        target.

    Returns (X_df, y_df): the sample columns except the last one, and the
    last column as a one-column DataFrame. Rows are shuffled without a fixed
    seed.

    Raises ValueError if step is not positive (the window would never
    advance), or if today is not in the market days list.
    """
    if step <= 0:
        raise ValueError('step must be a positive number of market days.')

    market_days = data_df.index
    end_of_training_date = add_market_days(today, -days_ahead, data_df)
    # train_time is in calendar days, so the subtraction may land on a date
    # that is not a market day; snap forward to the first market day on or
    # after it (a date that already is a market day is left unchanged).
    start_date = end_of_training_date - dt.timedelta(train_time)
    start_date = market_days[market_days >= start_date][0]
    start_target_date = add_market_days(start_date, base_time + days_ahead - 1, data_df)
    data_train_df = data_df[start_date:end_of_training_date]
    data_target_df = data_df.loc[start_target_date: today, TARGET_FEATURE]

    date_base_ini = start_date
    date_base_end = add_market_days(date_base_ini, base_time - 1, data_df)
    feat_tgt_blobs = []
    while date_base_end < end_of_training_date:
        # Recompute the target date for every window instead of relying on a
        # value left over in the notebook's global namespace.
        date_target = add_market_days(date_base_end, days_ahead, data_df)
        sample_blob = (data_train_df[date_base_ini: date_base_end],
                       pd.DataFrame(data_target_df.loc[date_target]))
        feat_tgt_blobs.append(blob_fun(sample_blob))
        date_base_ini = add_market_days(date_base_ini, step, data_df)
        date_base_end = add_market_days(date_base_ini, base_time - 1, data_df)

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and growing
    # a frame inside the loop is quadratic.
    if feat_tgt_blobs:
        feat_tgt_df = pd.concat(feat_tgt_blobs, ignore_index=True)
    else:
        feat_tgt_df = pd.DataFrame()
    feat_tgt_df = feat_tgt_df.sample(frac=1).reset_index(drop=True)
    X_df = feat_tgt_df.iloc[:, :-1]
    y_df = pd.DataFrame(feat_tgt_df.iloc[:, -1])
    return X_df, y_df
In [194]:
# Same parameters as in the scratch work, now passed to the packaged function.
train_time = 365 # Length of the training period, in real time days
base_days = 7 # Length of each base window, in market days
step_days = 7 # Shift between the starts of consecutive base windows, in market days
ahead_days = 1 # Distance from the end of a base window to its target, in market days
today = data_df.index[-1] # Real date: the last entry of the data index
X, y = generate_train_intervals(data_df, train_time, base_days, step_days, ahead_days, today, feature_close_one_to_one)
In [195]:
# Shape and first rows of the features returned by generate_train_intervals.
print(X.shape)
X.head()
Out[195]:
In [196]:
# Shape and first rows of the targets returned by generate_train_intervals.
print(y.shape)
y.head()
Out[196]:
In [ ]: