In this notebook the datasets for the predictor will be generated.



In [1]:

    
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys
from time import time
from sklearn.metrics import r2_score, median_absolute_error

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): %pylab inline also injects numpy and matplotlib names into the
# interactive namespace (hence the "Populating the interactive namespace"
# message in the cell output); that is what makes the bare `pylab` name below
# available without an explicit import.
%pylab inline
# Default figure size (width, height) used by the plots of this notebook.
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

# Automatically reload imported modules before running code, so that edits to
# the local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Extend the import path so the local packages below can be imported.
# Presumably '../../' is the project root holding `predictor` and `utils`;
# verify against the repository layout.
sys.path.append('../../')
import predictor.feature_extraction as fe
import utils.preprocessing as pp









    



Populating the interactive namespace from numpy and matplotlib

Let's first define the list of parameters to use in each dataset.



In [2]:

    
# Input values
# Thresholds on the ratio of non-missing values used to decide whether data is "good".
GOOD_DATA_RATIO = 0.99  # The ratio of non-missing values for a symbol to be considered good
SAMPLES_GOOD_DATA_RATIO = 0.9  # The ratio of non-missing values for an interval to be considered good

train_val_time = -1  # In real time days (-1 is for the full interval)

# Step days will be fixed. That means that the datasets with longer base
# periods will have samples that are more correlated.
step_days = 7  # market days

# One dataset is generated for every (base_days, ahead_days) combination.
base_days = [7, 14, 28, 56, 112]  # In market days
ahead_days = [7, 14, 28, 56]  # market days



In [8]:

    
# Parameter grid: one row per (base_days, ahead_days) combination, with the
# base period varying slowest.
datasets_params_list_df = pd.DataFrame(
    data=[(base, ahead) for base in base_days for ahead in ahead_days],
    columns=['base_days', 'ahead_days'],
)
datasets_params_list_df

Now, let's define the function to generate each dataset.



In [29]:

    
def generate_one_set(params):
    """Generate the dataset described by ``params``.

    Currently a pass-through: no generation is performed and the input is
    handed back untouched.

    Args:
        params: the parameters of one dataset. The disabled debug line below
            reads ``params['base_days']`` and ``params['ahead_days']``.

    Returns:
        The very same ``params`` object that was received.
    """
    # Debug banner (disabled):
    # print(('-'*70 + '\n {}, {} \n' + '-'*70).format(params['base_days'].values, params['ahead_days'].values))
    result = params
    return result

Finally, let's parallelize the generation of all the datasets, and generate them. (Some code and suggestions were taken from here: http://www.racketracer.com/2016/07/06/pandas-in-parallel/#comments)



In [30]:

    
from multiprocessing import Pool

# One partition per row of the parameter grid, so that splitting the grid into
# `num_partitions` pieces yields one (base_days, ahead_days) pair per piece.
num_partitions = datasets_params_list_df.shape[0]  # number of partitions to split the dataframe into
# Passed to Pool as the number of worker processes.
# NOTE(review): hard-coded; adjust to the machine running the notebook.
num_cores = 4  # number of cores on your machine

def parallelize_dataframe(df, func, n_partitions=None, n_cores=None):
    """Apply ``func`` to row-wise chunks of ``df`` in a pool of worker processes.

    The dataframe is cut into ``n_partitions`` consecutive row blocks (the
    first ``len(df) % n_partitions`` blocks get one extra row, which matches
    the sizes ``np.array_split`` produces), ``func`` is mapped over the blocks
    with ``multiprocessing.Pool.map`` and the results are concatenated in
    order.

    Args:
        df: the pandas DataFrame to process.
        func: callable that receives one chunk (a DataFrame) and returns an
            object accepted by ``pd.concat``. It is sent to the worker
            processes, so it must be picklable (e.g. a module-level function).
        n_partitions: number of chunks to split ``df`` into. Defaults to the
            module-level ``num_partitions``.
        n_cores: number of worker processes. Defaults to the module-level
            ``num_cores``.

    Returns:
        The concatenation (``pd.concat``) of the per-chunk results.

    Raises:
        ValueError: if ``n_partitions`` is smaller than 1.
    """
    # Fall back to the notebook-level settings to stay compatible with
    # existing two-argument calls.
    if n_partitions is None:
        n_partitions = num_partitions
    if n_cores is None:
        n_cores = num_cores
    if n_partitions < 1:
        raise ValueError('n_partitions must be a positive integer')

    # Slice with iloc instead of np.array_split(df, ...): the latter goes
    # through DataFrame.swapaxes, which is deprecated in recent pandas.
    base_size, remainder = divmod(len(df), n_partitions)
    chunks = []
    start = 0
    for i in range(n_partitions):
        stop = start + base_size + (1 if i < remainder else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    pool = Pool(n_cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)



In [31]:

    
# Apply generate_one_set to the parameter grid through the worker pool; the
# concatenated result is displayed as the cell output.
parallelize_dataframe(datasets_params_list_df, generate_one_set)









    



----------------------------------------------------------------------
 [7], [28] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [7], [7] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [14], [28] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [14], [7] 
----------------------------------------------------------------------


----------------------------------------------------------------------
 [7], [56] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [14], [14] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [14], [56] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [7], [14] 
----------------------------------------------------------------------

----------------------------------------------------------------------
 [28], [7] 
----------------------------------------------------------------------

----------------------------------------------------------------------
 [56], [7] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [56], [28] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [28], [28] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [56], [14] 
----------------------------------------------------------------------
----------------------------------------------------------------------
 [56], [56] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [28], [14] 
----------------------------------------------------------------------


----------------------------------------------------------------------
 [28], [56] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [112], [28] 
--------------------------------------------------------------------------------------------------------------------------------------------
 [112], [7] 
----------------------------------------------------------------------

----------------------------------------------------------------------
 [112], [56] 
----------------------------------------------------------------------

----------------------------------------------------------------------
 [112], [14] 
----------------------------------------------------------------------






    Out[31]:







  
    
      
      base_days
      ahead_days
    
  
  
    
      0
      7
      7
    
    
      1
      7
      14
    
    
      2
      7
      28
    
    
      3
      7
      56
    
    
      4
      14
      7
    
    
      5
      14
      14
    
    
      6
      14
      28
    
    
      7
      14
      56
    
    
      8
      28
      7
    
    
      9
      28
      14
    
    
      10
      28
      28
    
    
      11
      28
      56
    
    
      12
      56
      7
    
    
      13
      56
      14
    
    
      14
      56
      28
    
    
      15
      56
      56
    
    
      16
      112
      7
    
    
      17
      112
      14
    
    
      18
      112
      28
    
    
      19
      112
      56



In [ ]:



In [ ]:

	base_days	ahead_days
0	7	7
1	7	14
2	7	28
3	7	56
4	14	7
5	14	14
6	14	28
7	14	56
8	28	7
9	28	14
10	28	28
11	28	56
12	56	7
13	56	14
14	56	28
15	56	56
16	112	7
17	112	14
18	112	28
19	112	56

	base_days	ahead_days
0	7	7
1	7	14
2	7	28
3	7	56
4	14	7
5	14	14
6	14	28
7	14	56
8	28	7
9	28	14
10	28	28
11	28	56
12	56	7
13	56	14
14	56	28
15	56	56
16	112	7
17	112	14
18	112	28
19	112	56

	base_days	ahead_days
0	7	7
1	7	14
2	7	28
3	7	56
4	14	7
5	14	14
6	14	28
7	14	56
8	28	7
9	28	14
10	28	28
11	28	56
12	56	7
13	56	14
14	56	28
15	56	56
16	112	7
17	112	14
18	112	28
19	112	56

	base_days	ahead_days
0	7	7
1	7	14
2	7	28
3	7	56
4	14	7
5	14	14
6	14	28
7	14	56
8	28	7
9	28	14
10	28	28
11	28	56
12	56	7
13	56	14
14	56	28
15	56	56
16	112	7
17	112	14
18	112	28
19	112	56

	base_days	ahead_days
0	7	7
1	7	14
2	7	28
3	7	56
4	14	7
5	14	14
6	14	28
7	14	56
8	28	7
9	28	14
10	28	28
11	28	56
12	56	7
13	56	14
14	56	28
15	56	56
16	112	7
17	112	14
18	112	28
19	112	56

	base_days	ahead_days
0	7	7
1	7	14
2	7	28
3	7	56
4	14	7
5	14	14
6	14	28
7	14	56
8	28	7
9	28	14
10	28	28
11	28	56
12	56	7
13	56	14
14	56	28
15	56	56
16	112	7
17	112	14
18	112	28
19	112	56