In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)
%load_ext autoreload
%autoreload 2
sys.path.append('../../')
In [2]:
# Load the dataset used throughout this notebook (presumably the train/validation
# split, judging by the file name) and take a first look at its size and first rows.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
data_df = pd.read_pickle('../../data/data_train_val_df.pkl')
print(data_df.shape)
data_df.head()
Out[2]:
In [3]:
def generate_train_intervals(data_df, train_time, base_time, step, days_ahead, today):
    """Placeholder for the training-interval generator: does nothing and returns None."""
Let's define the parameters as constants, just to do some scratch work.
In [4]:
# Naming convention: variables measured in "market days" (rows of data_df's
# index) carry the "days" suffix; variables measured in real (calendar) time
# are named more freely.
train_time = 365 # Length of the training period, in real (calendar) days
base_days = 7 # Length of each base (feature) window, in market days
step_days = 7 # Shift between consecutive base windows, in market days
ahead_days = 1 # Distance from the end of a base window to its target, in market days
today = data_df.index[-1] # Real date: the last entry of data_df's index
In [5]:
today
Out[5]:
In [6]:
data_df.index[data_df.index <= today][-(ahead_days + 1)]
Out[6]:
In [7]:
def add_market_days(base, delta, data_df):
    """
    Move a date by a number of market days, clamping at both ends of the index.

    base is a real date; it must be one of the entries of data_df.index.
    delta is the (possibly negative) number of market days to move.

    Returns the entry of data_df.index found delta positions away from base.
    When that position falls past the end (or before the start) of the index,
    the last (or first) entry is returned instead.

    Raises Exception if base is not present in data_df.index.
    """
    calendar = data_df.index
    if base not in calendar:
        raise Exception('The base date is not in the market days list.')
    position = calendar.tolist().index(base) + delta
    if position >= len(calendar):
        # Ran past the last known market day: clamp to it.
        return calendar[-1]
    # A negative position means we ran before the first market day: clamp to it.
    return calendar[0] if position < 0 else calendar[position]
In [8]:
# The last `ahead_days` market days before `today` are kept out of the training
# data, so the training period ends `ahead_days` market days before `today`.
end_of_training_date = add_market_days(today, -ahead_days, data_df)
# NOTE(review): start_date is obtained with calendar-day arithmetic, so it is not
# guaranteed to be an entry of data_df.index; add_market_days raises when it is
# given a base date that is not in that index.
start_date = end_of_training_date - dt.timedelta(train_time)
print('Start date: %s. End of training date: %s.' % (start_date, end_of_training_date))
In [9]:
TARGET_FEATURE = 'Close'
In [10]:
def print_period(data_df):
    """Print the first and last index entries of data_df as a 'Period: ... to ...' line."""
    first_entry, last_entry = data_df.index[0], data_df.index[-1]
    print('Period: %s to %s.' % (first_entry, last_entry))
In [11]:
data_train_df = data_df[start_date:end_of_training_date]
print_period(data_train_df)
data_train_df.shape
Out[11]:
In [12]:
start_target_date = add_market_days(start_date, base_days + ahead_days - 1, data_df)
data_target_df = data_df.loc[start_target_date: today,TARGET_FEATURE]
print_period(data_target_df)
data_target_df.shape
Out[12]:
Is that initial date correct?
In [13]:
data_train_df.index[:10]
Out[13]:
Ok, it looks so.
I should allow for different feature extraction functions to be used, after the time divisions.
In [14]:
date_base_ini = start_date
date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
date_target = add_market_days(date_base_end, ahead_days, data_df)
sample_blob = (data_train_df[date_base_ini: date_base_end], pd.DataFrame(data_target_df.loc[date_target]))
sample_blob[0]
Out[14]:
In [15]:
target = sample_blob[1].T
target
Out[15]:
In [16]:
feat_close = sample_blob[0][TARGET_FEATURE]
feat_close.index = np.arange(feat_close.shape[0])
feat_close
Out[16]:
In [17]:
target.index = ['target']
target
Out[17]:
In [18]:
# Stack the target row under the feature rows.
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same row-wise stacking.
x_y_samples = pd.concat([feat_close, target])
x_y_samples
Out[18]:
In [19]:
x_y_samples_shuffled = x_y_samples.T.sample(frac=1).reset_index(drop=True)
x_y_samples_shuffled.head()
Out[19]:
In [20]:
x_y_samples_shuffled.isnull().sum()
Out[20]:
In [21]:
x_y_samples_filtered = x_y_samples_shuffled.dropna(axis=0, how='any')
print(x_y_samples_filtered.shape)
x_y_samples_filtered.isnull().sum()
Out[21]:
In [22]:
# At some point I will have to standardize those values... (not now, but just as a reminder...)
# For now, each row is divided by its own mean, taken over every column of the row
# (the target column included).
# NOTE(review): this uses x_y_samples_shuffled, not x_y_samples_filtered, so rows
# containing NaN values are still present here.
std_samples = x_y_samples_shuffled.apply(lambda x: x / np.mean(x), axis=1)
std_samples.head()
Out[22]:
In [23]:
features = std_samples.iloc[:,:-1]
features.head()
Out[23]:
In [24]:
target = pd.DataFrame(std_samples.iloc[:,-1])
target.head()
Out[24]:
In [25]:
TARGET_FEATURE = 'Close'


def feature_close_one_to_one(sample_blob):
    """
    Turn one (base window, target) blob into shuffled feature/target samples.

    sample_blob is a 2-tuple:
      - sample_blob[0]: DataFrame with the base window; selecting TARGET_FEATURE
        from it must yield a DataFrame (one row per day of the window).
      - sample_blob[1]: one-column DataFrame with the target values, indexed by
        the same labels as the columns of the selected feature frame.

    Returns a DataFrame with one row per column of the selected feature frame.
    Its columns are 0..n-1 (the n days of the base window) followed by
    'target'. Rows are randomly shuffled and any row containing a NaN is
    dropped. The input frames are not modified.
    """
    # reset_index returns a new frame labelled 0..n-1, so the caller's data is untouched.
    feat_close = sample_blob[0][TARGET_FEATURE].reset_index(drop=True)
    target = sample_blob[1].T
    target.index = ['target']
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
    x_y_samples = pd.concat([feat_close, target])
    x_y_samples_shuffled = x_y_samples.T.sample(frac=1).reset_index(drop=True)
    x_y_samples_filtered = x_y_samples_shuffled.dropna(axis=0, how='any')
    return x_y_samples_filtered
In [26]:
print(feature_close_one_to_one(sample_blob).shape)
feature_close_one_to_one(sample_blob).head()
Out[26]:
In [27]:
date_base_ini = start_date
date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
date_target = add_market_days(date_base_end, ahead_days, data_df)
# Collect the per-window samples in a list and concatenate them once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the loop
# copies all the accumulated rows on every iteration.
feat_tgt_blobs = []
while date_base_end < end_of_training_date:
    sample_blob = (data_train_df[date_base_ini: date_base_end],
                   pd.DataFrame(data_target_df.loc[date_target]))
    feat_tgt_blob = feature_close_one_to_one(sample_blob) # TODO: Change for a generic function
    feat_tgt_blobs.append(feat_tgt_blob)
    # Slide the base window forward and recompute its end and its target date.
    date_base_ini = add_market_days(date_base_ini, step_days, data_df)
    date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
    date_target = add_market_days(date_base_end, ahead_days, data_df)
    # print('Start: %s, End:%s' % (date_base_ini, date_base_end))
if feat_tgt_blobs:
    feat_tgt_df = pd.concat(feat_tgt_blobs, ignore_index=True)
else:
    # pd.concat refuses an empty list; keep the empty frame the loop used to start from.
    feat_tgt_df = pd.DataFrame()
# Shuffle the samples, then split features (all but the last column) from the target (last column).
feat_tgt_df = feat_tgt_df.sample(frac=1).reset_index(drop=True)
X_df = feat_tgt_df.iloc[:,:-1]
y_df = pd.DataFrame(feat_tgt_df.iloc[:,-1])
In [28]:
print(X_df.shape)
X_df.head()
Out[28]:
In [29]:
print(y_df.shape)
y_df.head()
Out[29]:
In [30]:
def generate_train_intervals(data_df, train_time, base_days, step_days, ahead_days, today, blob_fun):
    """
    Build a shuffled training set from rolling base windows over data_df.

    Parameters
    ----------
    data_df : DataFrame indexed by market days; TARGET_FEATURE is selected
        from its columns to obtain the target values.
    train_time : length of the training period, in real (calendar) days.
    base_days : length of each base window, in market days.
    step_days : shift between consecutive base windows, in market days (>= 1).
    ahead_days : distance from the end of a base window to its target date,
        in market days.
    today : real date; must be an entry of data_df.index.
    blob_fun : callable receiving a tuple (base-window DataFrame, one-column
        DataFrame with the target date's values) and returning a DataFrame of
        samples whose last column is the target.

    Returns
    -------
    (X_df, y_df) : every column but the last, and the last column as a
        DataFrame. Rows are shuffled and re-indexed from 0.

    Raises
    ------
    ValueError : if step_days < 1 (the windows would never advance).
    """
    if step_days < 1:
        raise ValueError('step_days must be at least 1.')

    end_of_training_date = add_market_days(today, -ahead_days, data_df)
    start_date = end_of_training_date - dt.timedelta(train_time)
    # start_date comes from calendar arithmetic, so it can land on a day without
    # market data; move it forward to the first market day on or after it
    # (add_market_days rejects dates that are not in the index).
    on_or_after_start = data_df.index[data_df.index >= start_date]
    if len(on_or_after_start) > 0:
        start_date = on_or_after_start.min()
    start_target_date = add_market_days(start_date, base_days + ahead_days - 1, data_df)

    data_train_df = data_df[start_date:end_of_training_date]
    data_target_df = data_df.loc[start_target_date: today, TARGET_FEATURE]

    date_base_ini = start_date
    date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
    date_target = add_market_days(date_base_end, ahead_days, data_df)

    # Gather the per-window samples and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and appending inside the loop is quadratic.
    feat_tgt_blobs = []
    while date_base_end < end_of_training_date:
        sample_blob = (data_train_df[date_base_ini: date_base_end],
                       pd.DataFrame(data_target_df.loc[date_target]))
        feat_tgt_blobs.append(blob_fun(sample_blob))
        date_base_ini = add_market_days(date_base_ini, step_days, data_df)
        date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
        date_target = add_market_days(date_base_end, ahead_days, data_df)

    if feat_tgt_blobs:
        feat_tgt_df = pd.concat(feat_tgt_blobs, ignore_index=True)
    else:
        # pd.concat refuses an empty list.
        feat_tgt_df = pd.DataFrame()

    feat_tgt_df = feat_tgt_df.sample(frac=1).reset_index(drop=True)
    X_df = feat_tgt_df.iloc[:, :-1]
    y_df = pd.DataFrame(feat_tgt_df.iloc[:, -1])
    return X_df, y_df
In [31]:
train_time = 365 # In real time days
base_days = 7 # In market days
step_days = 7 # market days
ahead_days = 1 # market days
today = data_df.index[-1] # Real date
X, y = generate_train_intervals(data_df, train_time, base_days, step_days, ahead_days, today, feature_close_one_to_one)
In [32]:
print(X.shape)
X.head()
Out[32]:
In [33]:
print(y.shape)
y.head()
Out[33]:
In [34]:
%pwd
Out[34]:
In [35]:
sys.path.append('../../')
import predictor.feature_extraction as fe
In [36]:
X, y = fe.generate_train_intervals(data_df,
train_time,
base_days,
step_days,
ahead_days,
today,
feature_close_one_to_one)
In [37]:
print(X.shape)
X.head()
Out[37]:
In [38]:
print(y.shape)
y.head()
Out[38]:
In [39]:
x_y_samples
Out[39]:
In [40]:
target = sample_blob[1].T
feat_close = sample_blob[0][TARGET_FEATURE]
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same row-wise stacking.
x_y_samples = pd.concat([feat_close, target])
x_y_samples
Out[40]:
In [41]:
x_y_samples.index = pd.MultiIndex.from_product([[x_y_samples.index[0]], np.arange(x_y_samples.shape[0])])
x_y_samples
Out[41]:
In [42]:
x_y_samples.unstack().stack(0).sample(frac=1).reset_index(level=1, drop=True).head()
Out[42]:
In [43]:
TARGET_FEATURE = 'Close'


def feature_close_one_to_one(sample_blob):
    """
    Turn one (base window, target) blob into shuffled feature/target samples,
    indexed by the first date of the base window.

    sample_blob is a 2-tuple:
      - sample_blob[0]: DataFrame with the base window; selecting TARGET_FEATURE
        from it must yield a DataFrame (one row per day of the window).
      - sample_blob[1]: one-column DataFrame with the target values, indexed by
        the same labels as the columns of the selected feature frame.

    Returns a DataFrame with one row per column of the selected feature frame.
    Its columns are 0..n, where 0..n-1 are the n days of the base window and
    n is the target. Every row is indexed by the first date of the base
    window. Rows are randomly shuffled and any row containing a NaN is dropped.
    """
    target = sample_blob[1].T
    feat_close = sample_blob[0][TARGET_FEATURE]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
    x_y_samples = pd.concat([feat_close, target])
    # Relabel the rows as (first date of the window, position) so that
    # unstacking spreads the positions over the columns.
    x_y_samples.index = pd.MultiIndex.from_product([[x_y_samples.index[0]],
                                                    np.arange(x_y_samples.shape[0])])
    # After unstack/stack the rows are (first date, original column label);
    # the second level is dropped once the rows have been shuffled.
    x_y_samples_shuffled = x_y_samples.unstack().stack(0).sample(frac=1).reset_index(level=1, drop=True)
    x_y_samples_filtered = x_y_samples_shuffled.dropna(axis=0, how='any')
    return x_y_samples_filtered
In [44]:
print(feature_close_one_to_one(sample_blob).shape)
feature_close_one_to_one(sample_blob).head()
Out[44]:
In [45]:
def generate_train_intervals(data_df, train_time, base_days, step_days, ahead_days, today, blob_fun):
    """
    Build a shuffled training set from rolling base windows over data_df.

    Parameters
    ----------
    data_df : DataFrame indexed by market days; TARGET_FEATURE is selected
        from its columns to obtain the target values.
    train_time : length of the training period, in real (calendar) days.
    base_days : length of each base window, in market days.
    step_days : shift between consecutive base windows, in market days (>= 1).
    ahead_days : distance from the end of a base window to its target date,
        in market days.
    today : real date; must be an entry of data_df.index.
    blob_fun : callable receiving a tuple (base-window DataFrame, one-column
        DataFrame with the target date's values) and returning a DataFrame of
        samples whose last column is the target.

    Returns
    -------
    (X_df, y_df) : every column but the last, and the last column as a
        one-column DataFrame named 'target'. Rows are shuffled and keep the
        index produced by blob_fun.

    Raises
    ------
    ValueError : if step_days < 1 (the windows would never advance).
    """
    if step_days < 1:
        raise ValueError('step_days must be at least 1.')

    end_of_training_date = add_market_days(today, -ahead_days, data_df)
    start_date = end_of_training_date - dt.timedelta(train_time)
    # start_date comes from calendar arithmetic, so it can land on a day without
    # market data; move it forward to the first market day on or after it
    # (add_market_days rejects dates that are not in the index).
    on_or_after_start = data_df.index[data_df.index >= start_date]
    if len(on_or_after_start) > 0:
        start_date = on_or_after_start.min()
    start_target_date = add_market_days(start_date, base_days + ahead_days - 1, data_df)

    data_train_df = data_df[start_date:end_of_training_date]
    data_target_df = data_df.loc[start_target_date: today, TARGET_FEATURE]

    date_base_ini = start_date
    date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
    date_target = add_market_days(date_base_end, ahead_days, data_df)

    # Gather the per-window samples and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and appending inside the loop is quadratic.
    feat_tgt_blobs = []
    while date_base_end < end_of_training_date:
        sample_blob = (data_train_df[date_base_ini: date_base_end],
                       pd.DataFrame(data_target_df.loc[date_target]))
        feat_tgt_blobs.append(blob_fun(sample_blob))
        date_base_ini = add_market_days(date_base_ini, step_days, data_df)
        date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
        date_target = add_market_days(date_base_end, ahead_days, data_df)

    if feat_tgt_blobs:
        feat_tgt_df = pd.concat(feat_tgt_blobs)
    else:
        # pd.concat refuses an empty list.
        feat_tgt_df = pd.DataFrame()

    feat_tgt_df = feat_tgt_df.sample(frac=1)
    X_df = feat_tgt_df.iloc[:, :-1]
    # Rename whatever the last column is called: a hard-coded label of 7 only
    # matched it when base_days happened to be 7.
    target_label = feat_tgt_df.columns[-1]
    y_df = pd.DataFrame(feat_tgt_df.iloc[:, -1]).rename(columns={target_label: 'target'})
    return X_df, y_df
In [46]:
from time import time
tic = time()
X, y = generate_train_intervals(data_df,
train_time,
base_days,
step_days,
ahead_days,
today,
feature_close_one_to_one)
toc = time()
print('Elapsed time: %i seconds.' % (toc-tic))
In [47]:
print(X.shape)
X.head(10)
Out[47]:
In [48]:
print(y.shape)
y.head(10)
Out[48]:
In [49]:
sys.path.append('../../')
import predictor.feature_extraction as fe
X, y = fe.generate_train_intervals(data_df,
train_time,
base_days,
step_days,
ahead_days,
today,
feature_close_one_to_one)
In [50]:
print(X.shape)
X.head(10)
Out[50]:
In [51]:
print(y.shape)
y.head(10)
Out[51]:
In [ ]:
In [55]:
data_df
Out[55]:
In [57]:
base = data_df.index[0]
delta = 252
In [63]:
market_days = np.unique(data_df.sort_index().index)
len(market_days)
Out[63]:
In [ ]:
In [ ]:
def add_market_days(base, delta, data_df):
    """
    Move a date by a number of market days, clamping at both ends of the calendar.

    base is in real time; it must be one of the dates in data_df.index.
    delta is in market days (it may be negative).

    The market-day calendar is the set of distinct dates in data_df.index, in
    increasing order, so an index that repeats a date still counts that date
    as a single market day.

    Returns the calendar date found delta market days away from base. When
    that falls past the end (or before the start) of the calendar, the last
    (or first) market day is returned instead.

    Raises Exception if base is not in the market days list.
    """
    market_days = data_df.index
    # Counting positions in the raw index is only right when every date appears
    # once and in order; otherwise reduce it to the sorted distinct dates.
    if not (market_days.is_unique and market_days.is_monotonic_increasing):
        market_days = market_days.unique().sort_values()
    if base not in market_days:
        raise Exception('The base date is not in the market days list.')
    # Index lookup instead of converting the whole index to a list on every call.
    base_index = market_days.get_loc(base)
    shifted_index = min(max(base_index + delta, 0), len(market_days) - 1)
    return market_days[shifted_index]