In [53]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error
%matplotlib inline
plt.rcParams['figure.figsize'] = (20.0, 10.0)
%load_ext autoreload
%autoreload 2
sys.path.append('../../')
import utils.misc as misc
In [18]:
res_df = pd.read_pickle('../../data/results_ahead1_linear_df.pkl')
In [19]:
res_df.head()
Out[19]:
In [37]:
RELEVANT_COLUMNS = ['base_days',
                    'train_days',
                    'r2',
                    'mre',
                    'ahead_days',
                    'train_val_time',
                    'step_days',
                    'GOOD_DATA_RATIO',
                    'SAMPLES_GOOD_DATA_RATIO',
                    'x_filename',
                    'y_filename']
best_params_df = res_df.loc[res_df['mre'].idxmin(), RELEVANT_COLUMNS]
best_params_df['model'] = 'linear'
best_params_df
Out[37]:
In [38]:
test_df = pd.DataFrame()
test_df = test_df.append(best_params_df, ignore_index=True)
test_df
Out[38]:
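Note that DataFrame.append is deprecated in recent pandas releases. The same one-row frame can be built directly from the Series of best parameters, as in this equivalent sketch:
In [ ]:
# One-row DataFrame from the best-parameters Series (works on pandas >= 2.0,
# where DataFrame.append was removed).
test_df = pd.DataFrame([best_params_df]).reset_index(drop=True)
test_df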
In [39]:
RELEVANT_COLUMNS = ['base_days',
                    'train_days',
                    'r2',
                    'mre',
                    'ahead_days',
                    'train_val_time',
                    'step_days',
                    'GOOD_DATA_RATIO',
                    'SAMPLES_GOOD_DATA_RATIO',
                    'x_filename',
                    'y_filename']
ahead_days_list = [1, 7, 14, 28, 56]
models_list = ['linear', 'random_forest']
results_df = pd.DataFrame()
for ahead_days in ahead_days_list:
    for model in models_list:
        res_df = pd.read_pickle('../../data/results_ahead{}_{}_df.pkl'.format(ahead_days, model))
        # Keep the row with the best (highest) r2 for this horizon/model combination.
        best_params_df = res_df.loc[res_df['r2'].idxmax(), RELEVANT_COLUMNS]
        best_params_df['ahead_days'] = ahead_days
        best_params_df['model'] = model
        results_df = results_df.append(best_params_df, ignore_index=True)
In [40]:
results_df
Out[40]:
In [41]:
results_df.to_pickle('../../data/best_dataset_params_raw_df.pkl')
Which is the best model before hyperparameter tuning?
In [42]:
def keep_max_r2(record):
    # Within each ahead_days group, keep the row with the highest r2.
    return record.loc[record['r2'].idxmax(), :]
best_r2_df = results_df.groupby('ahead_days').apply(keep_max_r2)
best_r2_df
Out[42]:
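The same per-horizon selection can also be done without apply, using idxmax on the grouped r2 column (a sketch, assuming results_df keeps its default integer index as built above):
In [ ]:
# For each ahead_days group, find the index label of the row with the highest r2
# and select those rows directly.
results_df.loc[results_df.groupby('ahead_days')['r2'].idxmax()].set_index('ahead_days')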
In [43]:
best_r2_df[['mre', 'r2']].plot()
Out[43]:
Before hyperparameter tuning, linear regression seems to do better at every prediction horizon. As expected, the longer the horizon, the lower the R² and the higher the MRE.
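To see that comparison at a glance, the results can be pivoted so each model gets its own column per metric (a sketch based on the results_df columns shown above):
In [ ]:
# r2 and mre per horizon, one column per model.
comparison_df = results_df.pivot_table(index='ahead_days', columns='model', values=['r2', 'mre'])
comparison_df['r2'].plot(marker='o', title='r2 per horizon, by model')
comparison_df
Next, the random forest's best dataset parameters for each horizon (from the table above) are kept as the starting point for its hyperparameter tuning.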
In [44]:
initial_performance_df = results_df[results_df['model'] == 'random_forest'].copy()
initial_performance_df.set_index('ahead_days', inplace=True)
initial_performance_df
Out[44]:
In [45]:
initial_performance_df.loc[14, 'base_days']
Out[45]:
In [100]:
n_estimators = [10, 50, 100, 200]
max_depth = [None, 5, 10, 15]
hyper_df = pd.DataFrame([(x, y) for x in n_estimators for y in max_depth], columns=['n_estimators', 'max_depth'])
hyper_df['n_jobs'] = -1
hyper_df
Out[100]:
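The same grid could also be generated with scikit-learn's ParameterGrid; an equivalent sketch (column order may differ, and n_jobs is still added afterwards):
In [ ]:
from sklearn.model_selection import ParameterGrid
# Cartesian product of the hyperparameter values, as a list of dicts.
grid = list(ParameterGrid({'n_estimators': n_estimators, 'max_depth': max_depth}))
hyper_df_alt = pd.DataFrame(grid)
hyper_df_alt['n_jobs'] = -1
hyper_df_alt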
In [101]:
params_df = initial_performance_df.loc[1]
params_df
Out[101]:
In [96]:
AHEAD_DAYS = 1
# Get the normal parameters set
params_df = initial_performance_df.loc[AHEAD_DAYS].copy()
params_df['ahead_days'] = AHEAD_DAYS
tic = time()
from predictor.random_forest_predictor import RandomForestPredictor
PREDICTOR_NAME = 'random_forest'
# Global variables
eval_predictor_class = RandomForestPredictor
step_eval_days = 60 # The step to move between training/validation pairs
# Build the params list
params = {'params_df': params_df,
          'step_eval_days': step_eval_days,
          'eval_predictor_class': eval_predictor_class}
results_df = misc.parallelize_dataframe(hyper_df, misc.search_mean_score_eval, params)
# Some postprocessing... -----------------------------------------------------------
results_df['r2'] = results_df.apply(lambda x: x['scores'][0], axis=1)
results_df['mre'] = results_df.apply(lambda x: x['scores'][1], axis=1)
# Pickle that!
results_df.to_pickle('../../data/hyper_ahead{}_{}_df.pkl'.format(AHEAD_DAYS, PREDICTOR_NAME))
results_df['r2'].plot()
print('Minimum MRE param set: \n {}'.format(results_df.iloc[np.argmin(results_df['mre'])]))
print('Maximum R^2 param set: \n {}'.format(results_df.iloc[np.argmax(results_df['r2'])]))
# -----------------------------------------------------------------------------------
toc = time()
print('Elapsed time: {} seconds.'.format((toc-tic)))
In [102]:
AHEAD_DAYS = 7
# Get the normal parameters set
params_df = initial_performance_df.loc[AHEAD_DAYS].copy()
params_df['ahead_days'] = AHEAD_DAYS
tic = time()
from predictor.random_forest_predictor import RandomForestPredictor
PREDICTOR_NAME = 'random_forest'
# Global variables
eval_predictor_class = RandomForestPredictor
step_eval_days = 60 # The step to move between training/validation pairs
# Build the params list
params = {'params_df': params_df,
          'step_eval_days': step_eval_days,
          'eval_predictor_class': eval_predictor_class}
results_df = misc.parallelize_dataframe(hyper_df, misc.search_mean_score_eval, params)
# Some postprocessing... -----------------------------------------------------------
results_df['r2'] = results_df.apply(lambda x: x['scores'][0], axis=1)
results_df['mre'] = results_df.apply(lambda x: x['scores'][1], axis=1)
# Pickle that!
results_df.to_pickle('../../data/hyper_ahead{}_{}_df.pkl'.format(AHEAD_DAYS, PREDICTOR_NAME))
results_df['r2'].plot()
print('Minimum MRE param set: \n {}'.format(results_df.iloc[np.argmin(results_df['mre'])]))
print('Maximum R^2 param set: \n {}'.format(results_df.iloc[np.argmax(results_df['r2'])]))
# -----------------------------------------------------------------------------------
toc = time()
print('Elapsed time: {} seconds.'.format((toc-tic)))
In [ ]:
AHEAD_DAYS = 14
# Get the normal parameters set
params_df = initial_performance_df.loc[AHEAD_DAYS].copy()
params_df['ahead_days'] = AHEAD_DAYS
tic = time()
from predictor.random_forest_predictor import RandomForestPredictor
PREDICTOR_NAME = 'random_forest'
# Global variables
eval_predictor_class = RandomForestPredictor
step_eval_days = 60 # The step to move between training/validation pairs
# Build the params list
params = {'params_df': params_df,
          'step_eval_days': step_eval_days,
          'eval_predictor_class': eval_predictor_class}
results_df = misc.parallelize_dataframe(hyper_df, misc.search_mean_score_eval, params)
# Some postprocessing... -----------------------------------------------------------
results_df['r2'] = results_df.apply(lambda x: x['scores'][0], axis=1)
results_df['mre'] = results_df.apply(lambda x: x['scores'][1], axis=1)
# Pickle that!
results_df.to_pickle('../../data/hyper_ahead{}_{}_df.pkl'.format(AHEAD_DAYS, PREDICTOR_NAME))
results_df['r2'].plot()
print('Minimum MRE param set: \n {}'.format(results_df.iloc[np.argmin(results_df['mre'])]))
print('Maximum R^2 param set: \n {}'.format(results_df.iloc[np.argmax(results_df['r2'])]))
# -----------------------------------------------------------------------------------
toc = time()
print('Elapsed time: {} seconds.'.format((toc-tic)))
In [ ]:
AHEAD_DAYS = 28
# Get the normal parameters set
params_df = initial_performance_df.loc[AHEAD_DAYS].copy()
params_df['ahead_days'] = AHEAD_DAYS
tic = time()
from predictor.random_forest_predictor import RandomForestPredictor
PREDICTOR_NAME = 'random_forest'
# Global variables
eval_predictor_class = RandomForestPredictor
step_eval_days = 60 # The step to move between training/validation pairs
# Build the params list
params = {'params_df': params_df,
          'step_eval_days': step_eval_days,
          'eval_predictor_class': eval_predictor_class}
results_df = misc.parallelize_dataframe(hyper_df, misc.search_mean_score_eval, params)
# Some postprocessing... -----------------------------------------------------------
results_df['r2'] = results_df.apply(lambda x: x['scores'][0], axis=1)
results_df['mre'] = results_df.apply(lambda x: x['scores'][1], axis=1)
# Pickle that!
results_df.to_pickle('../../data/hyper_ahead{}_{}_df.pkl'.format(AHEAD_DAYS, PREDICTOR_NAME))
results_df['r2'].plot()
print('Minimum MRE param set: \n {}'.format(results_df.iloc[np.argmin(results_df['mre'])]))
print('Maximum R^2 param set: \n {}'.format(results_df.iloc[np.argmax(results_df['r2'])]))
# -----------------------------------------------------------------------------------
toc = time()
print('Elapsed time: {} seconds.'.format((toc-tic)))
In [ ]:
AHEAD_DAYS = 56
# Get the normal parameters set
params_df = initial_performance_df.loc[AHEAD_DAYS].copy()
params_df['ahead_days'] = AHEAD_DAYS
tic = time()
from predictor.random_forest_predictor import RandomForestPredictor
PREDICTOR_NAME = 'random_forest'
# Global variables
eval_predictor_class = RandomForestPredictor
step_eval_days = 60 # The step to move between training/validation pairs
# Build the params list
params = {'params_df': params_df,
          'step_eval_days': step_eval_days,
          'eval_predictor_class': eval_predictor_class}
results_df = misc.parallelize_dataframe(hyper_df, misc.search_mean_score_eval, params)
# Some postprocessing... -----------------------------------------------------------
results_df['r2'] = results_df.apply(lambda x: x['scores'][0], axis=1)
results_df['mre'] = results_df.apply(lambda x: x['scores'][1], axis=1)
# Pickle that!
results_df.to_pickle('../../data/hyper_ahead{}_{}_df.pkl'.format(AHEAD_DAYS, PREDICTOR_NAME))
results_df['r2'].plot()
print('Minimum MRE param set: \n {}'.format(results_df.iloc[np.argmin(results_df['mre'])]))
print('Maximum R^2 param set: \n {}'.format(results_df.iloc[np.argmax(results_df['r2'])]))
# -----------------------------------------------------------------------------------
toc = time()
print('Elapsed time: {} seconds.'.format((toc-tic)))
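The five cells above differ only in AHEAD_DAYS. A small helper could run the same search for each remaining horizon in a loop; this is only a sketch, and it assumes misc.parallelize_dataframe and misc.search_mean_score_eval behave exactly as in the cells above.
In [ ]:
def run_hyper_search(ahead_days, hyper_df, initial_performance_df,
                     predictor_name='random_forest', step_eval_days=60):
    """Run the hyperparameter search for one prediction horizon and pickle the results."""
    # Start from the best dataset parameters found for this horizon.
    params_df = initial_performance_df.loc[ahead_days].copy()
    params_df['ahead_days'] = ahead_days
    params = {'params_df': params_df,
              'step_eval_days': step_eval_days,
              'eval_predictor_class': RandomForestPredictor}
    res = misc.parallelize_dataframe(hyper_df, misc.search_mean_score_eval, params)
    # Unpack the (r2, mre) score tuples into their own columns.
    res['r2'] = res.apply(lambda x: x['scores'][0], axis=1)
    res['mre'] = res.apply(lambda x: x['scores'][1], axis=1)
    res.to_pickle('../../data/hyper_ahead{}_{}_df.pkl'.format(ahead_days, predictor_name))
    return res

# Example usage: run the remaining horizons in one pass.
# for ahead_days in [14, 28, 56]:
#     hyper_results = run_hyper_search(ahead_days, hyper_df, initial_performance_df)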