In this notebook, the datasets for the predictor will be generated.


In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error

# Render matplotlib figures inline, right below the cell that produces them.
%matplotlib inline

# NOTE(review): %pylab also populates the interactive namespace from numpy and matplotlib
# (see the message printed below this cell). It is kept because the next line uses the
# `pylab` name, but explicit imports are usually preferable.
%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)  # default figure size: (width, height)

# Automatically reload imported modules before running code, so that edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Add the directory two levels up to the import path; the project imports that follow
# (`predictor`, `utils`) presumably live there.
sys.path.append('../../')
import predictor.feature_extraction as fe
import utils.preprocessing as pp


Populating the interactive namespace from numpy and matplotlib

Let's first define the list of parameters to use in each dataset.


In [2]:
# Input values

# Minimum ratios of non-missing values for data to be considered good.
GOOD_DATA_RATIO = 0.99  # threshold for a symbol
SAMPLES_GOOD_DATA_RATIO = 0.9  # threshold for an interval

# In real time days (-1 is for the full interval).
train_val_time = -1

# Step days will be fixed (in market days). That means that the datasets with
# longer base periods will have samples that are more correlated.
step_days = 7

# Period lengths to combine, all in market days.
base_days = [7, 14, 28, 56, 112]
ahead_days = [7, 14, 28, 56]

In [8]:
# One row per (base_days, ahead_days) combination; base_days varies slowest.
datasets_params_list_df = pd.DataFrame(
    data=[(base, ahead) for base in base_days for ahead in ahead_days],
    columns=['base_days', 'ahead_days'],
)
datasets_params_list_df


Out[8]:
base_days ahead_days
0 7 7
1 7 14
2 7 28
3 7 56
4 14 7
5 14 14
6 14 28
7 14 56
8 28 7
9 28 14
10 28 28
11 28 56
12 56 7
13 56 14
14 56 28
15 56 56
16 112 7
17 112 14
18 112 28
19 112 56

Now, let's define the function to generate each dataset.


In [29]:
def generate_one_set(params, verbose=False):
    """Generate the dataset for one combination of parameters.

    No dataset is actually built yet: the function currently hands ``params``
    back unchanged, which makes it usable as a stand-in while testing the
    parallel machinery.

    Parameters
    ----------
    params : pandas.DataFrame
        The parameters of the dataset to generate. Only read when ``verbose``
        is true, in which case it must have 'base_days' and 'ahead_days'
        columns. Presumably a one-row slice of the parameters dataframe --
        confirm against the caller.
    verbose : bool, optional
        If True, print a banner with the 'base_days' and 'ahead_days' values
        being processed. Defaults to False (silent).

    Returns
    -------
    The same ``params`` object that was passed in.
    """
    if verbose:
        banner = '-' * 70
        print((banner + '\n {}, {} \n' + banner).format(params['base_days'].values,
                                                        params['ahead_days'].values))

    return params

Finally, let's parallelize the generation of all the datasets, and generate them. (Some of the code and suggestions were taken from here: http://www.racketracer.com/2016/07/06/pandas-in-parallel/#comments)


In [30]:
from multiprocessing import Pool

# Number of partitions to split the dataframe into: one per row of the parameters dataframe.
num_partitions = len(datasets_params_list_df)
# Number of worker processes; set it to the number of cores on your machine.
num_cores = 4

def parallelize_dataframe(df, func, n_partitions=None, n_cores=None):
    """Apply ``func`` to row-wise chunks of ``df`` and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to process. It is split by rows into ``n_partitions``
        chunks of (almost) equal size, keeping the original row order.
    func : callable
        Called once per chunk; must return something ``pd.concat`` accepts.
        When a process pool is used it has to be picklable.
    n_partitions : int, optional
        Number of chunks. Defaults to the module-level ``num_partitions``.
    n_cores : int, optional
        Number of worker processes. Defaults to the module-level ``num_cores``.
        With ``n_cores == 1`` the chunks are processed in the current process,
        without creating a pool (handy for debugging ``func``).

    Returns
    -------
    pandas.DataFrame
        The concatenation of ``func(chunk)`` for every chunk, in chunk order.

    Raises
    ------
    ValueError
        If ``n_partitions`` is not positive (raised by ``np.array_split``).
    """
    if n_partitions is None:
        n_partitions = num_partitions
    if n_cores is None:
        n_cores = num_cores

    # Split the row positions rather than the dataframe itself: np.array_split
    # on a DataFrame goes through DataFrame.swapaxes, which newer pandas
    # versions deprecate. The resulting chunks hold the same rows.
    row_groups = np.array_split(np.arange(len(df)), n_partitions)
    chunks = [df.iloc[rows] for rows in row_groups]

    if n_cores == 1:
        results = [func(chunk) for chunk in chunks]
    else:
        # The context manager releases the workers even if func raises.
        with Pool(n_cores) as pool:
            results = pool.map(func, chunks)

    return pd.concat(results)

In [31]:
# Run generate_one_set on every partition of the parameters dataframe and display
# the concatenated result.
parallelize_dataframe(datasets_params_list_df, generate_one_set)


----------------------------------------------------------------------
 [7], [28] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [7], [7] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [14], [28] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [14], [7] 
----------------------------------------------------------------------


----------------------------------------------------------------------
 [7], [56] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [14], [14] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [14], [56] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [7], [14] 
----------------------------------------------------------------------

----------------------------------------------------------------------
 [28], [7] 
----------------------------------------------------------------------

----------------------------------------------------------------------
 [56], [7] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [56], [28] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [28], [28] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [56], [14] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [56], [56] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [28], [14] 
----------------------------------------------------------------------


----------------------------------------------------------------------
 [28], [56] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [112], [28] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [112], [7] 
----------------------------------------------------------------------

----------------------------------------------------------------------
 [112], [56] 
----------------------------------------------------------------------

----------------------------------------------------------------------
 [112], [14] 
----------------------------------------------------------------------
Out[31]:
base_days ahead_days
0 7 7
1 7 14
2 7 28
3 7 56
4 14 7
5 14 14
6 14 28
7 14 56
8 28 7
9 28 14
10 28 28
11 28 56
12 56 7
13 56 14
14 56 28
15 56 56
16 112 7
17 112 14
18 112 28
19 112 56

In [ ]:


In [ ]: