Missing data must never be filled in the target variable, or the evaluation of the results would be corrupted. That is a real risk in this problem if filling is done carelessly, because the target variable and the features come from the same series, only time-shifted.
Filling forward first and then backwards is the best way to preserve causality: a forward fill only propagates past values into the future, so the backward fill is needed just for the gaps at the very start of a series.
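As a minimal sketch, assuming a prices_df frame with dates in the index and symbols in the columns (the name is illustrative):

# Forward fill first: gaps are covered with the last known past value,
# so no future information leaks backwards in time. The backward fill
# afterwards only remains necessary for leading NaNs, i.e. dates before
# a symbol's first quote.
prices_df = prices_df.ffill().bfill()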
Filtering out the symbols with a large fraction of missing data could also help; otherwise, after filling, the predictor may find itself fed long runs of constant data.
Filling missing data and dropping "bad samples" can be done at three levels: on the whole dataset, within each training period, or on the individual base samples. For the filling part the differences are probably small, but for dropping samples they may be significant; the sketch below contrasts the two extreme levels for the filling case.
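A hedged sketch of the two extremes (hypothetical names: prices_df as above; samples_df holds one base sample per row, like the frame built at the end of this notebook):

# Whole-dataset level: fill along the time axis over the full history,
# so a gap may be covered with a value from far outside any one sample.
filled_all = prices_df.ffill().bfill()

# Sample level: fill along each row independently, so a gap is only
# covered with values from inside that sample's own base window.
filled_per_sample = samples_df.ffill(axis=1).bfill(axis=1)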
In [120]:
import os
import sys
import math
import datetime as dt
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as spo
from sklearn.metrics import r2_score, median_absolute_error

%matplotlib inline
plt.rcParams['figure.figsize'] = (20.0, 10.0)
%load_ext autoreload
%autoreload 2
sys.path.append('../../')
In [121]:
from utils import preprocessing as pp
In [122]:
data_df = pd.read_pickle('../../data/data_train_val_df.pkl')
In [123]:
print(data_df.shape)
data_df.head()
Out[123]:
In [124]:
data_df.columns.nlevels
Out[124]:
In [125]:
data_df['Close'].shape
Out[125]:
In [126]:
# Fraction of non-null Close values per symbol.
good_ratios = 1.0 - (data_df['Close'].isnull().sum() / data_df['Close'].shape[0])
good_ratios.sort_values(ascending=False).plot()
Out[126]:
In [127]:
filtered_data_df = pp.drop_irrelevant_symbols(data_df['Close'], good_data_ratio=0.99)
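drop_irrelevant_symbols comes from this project's utils.preprocessing module. A plausible sketch of its logic for a single-level frame like data_df['Close'] (the real implementation may differ; for instance, it also accepts the two-level frame used further below):

def drop_irrelevant_symbols(df, good_data_ratio=0.99):
    """Keep only the symbols (columns) whose fraction of non-null
    values is at least good_data_ratio."""
    good_ratios = 1.0 - df.isnull().sum() / df.shape[0]
    return df.loc[:, good_ratios >= good_data_ratio]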
In [128]:
good_ratios = 1.0 - (filtered_data_df.isnull().sum() / filtered_data_df.shape[0])
good_ratios.sort_values(ascending=False).plot()
Out[128]:
In [129]:
filtered_data_df.shape
Out[129]:
In [130]:
filtered_data_df.head()
Out[130]:
In [131]:
filtered_data_df.isnull().sum().sort_values(ascending=False)
Out[131]:
In [132]:
good_data_ratio = 0.99
FEATURE_OF_INTEREST = 'Close'
# dropna's thresh is the minimum number of non-null values a column must
# have to survive, hence the ceiling of good_data_ratio times the row count.
filtered_data_df = data_df[FEATURE_OF_INTEREST].dropna(
    thresh=math.ceil(good_data_ratio * data_df[FEATURE_OF_INTEREST].shape[0]), axis=1)
In [133]:
filtered_data_df.head()
Out[133]:
In [134]:
filtered_data_df.columns
Out[134]:
In [135]:
fdata_df = data_df.loc[:, (slice(None), filtered_data_df.columns.tolist())]
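An equivalent, arguably more readable way to slice on the second column level is pandas' IndexSlice helper:

idx = pd.IndexSlice
fdata_df = data_df.loc[:, idx[:, filtered_data_df.columns.tolist()]]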
In [136]:
new_cols = fdata_df.columns.get_level_values(1)
In [137]:
np.setdiff1d(new_cols, filtered_data_df.columns)
Out[137]:
In [138]:
np.setdiff1d(filtered_data_df.columns, new_cols)
Out[138]:
In [139]:
np.intersect1d(filtered_data_df.columns, new_cols).shape
Out[139]:
In [140]:
filtered_data_df.columns.shape
Out[140]:
In [142]:
filtered_data_df = pp.drop_irrelevant_symbols(data_df, good_data_ratio=0.99)
In [143]:
good_ratios = 1.0 - (filtered_data_df['Close'].isnull().sum() / filtered_data_df['Close'].shape[0])
good_ratios.sort_values(ascending=False).plot()
Out[143]:
In [145]:
import predictor.feature_extraction as fe
train_time = -1   # In real (calendar) days
base_days = 7     # In market days
step_days = 30    # In market days
ahead_days = 1    # In market days
today = data_df.index[-1]  # Real date
tic = time()
x, y = fe.generate_train_intervals(data_df,
                                   train_time,
                                   base_days,
                                   step_days,
                                   ahead_days,
                                   today,
                                   fe.feature_close_one_to_one)
toc = time()
print('Elapsed time: %i seconds.' % (toc - tic))
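generate_train_intervals lives in this project's predictor.feature_extraction module; feature_close_one_to_one presumably maps each close price in the base window to one feature. As a rough, single-symbol sketch of the windowing idea (a hypothetical helper; the real function handles the full multi-symbol frame and the train_time / today arguments):

def sliding_samples(close_series, base_days, step_days, ahead_days):
    """Cut base_days-long feature windows every step_days market days,
    pairing each with the close price ahead_days after the window's end."""
    xs, ys = [], []
    last_start = len(close_series) - base_days - ahead_days
    for start in range(0, last_start + 1, step_days):
        window = close_series.iloc[start:start + base_days]
        target = close_series.iloc[start + base_days + ahead_days - 1]
        xs.append(window.values)
        ys.append(target)
    return np.array(xs), np.array(ys)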
In [146]:
x.shape
Out[146]:
In [147]:
y.shape
Out[147]:
In [152]:
x_y_df = pd.concat([x, y], axis=1)
In [153]:
x_y_df.shape
Out[153]:
In [154]:
x_y_df.head()
Out[154]:
In [157]:
x_y_df.isnull().sum(axis=1)
Out[157]:
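The row-wise null counts show which samples still have gaps. Keeping the rule from the top of this notebook that the target is never filled, a hedged clean-up could look like this (assuming y contributed a single target column, concatenated last):

target_col = x_y_df.columns[-1]
# Never fill the target: drop the samples where it is missing.
clean_df = x_y_df[x_y_df[target_col].notnull()].copy()
# Remaining feature gaps are filled within each sample, forward first
# and backward only for gaps at the start of the base window.
feature_cols = clean_df.columns[:-1]
clean_df.loc[:, feature_cols] = clean_df[feature_cols].ffill(axis=1).bfill(axis=1)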