notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import coo_matrix,csr_matrix,csc_matrix, hstack
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn import linear_model
import gc



In [2]:

    
pd.set_option('display.max_columns', 500)



In [3]:

    
%pwd









    Out[3]:





u'/home/siyuan/Documents/bimbo_kaggle'



In [4]:

    
%ls









    



1.5_create_lag.ipynb              pivot_test.pickle
1_predata.ipynb                   pivot_train_with_nan.pickle
1_predata_whole.ipynb             preprocessed_products.csv
3_xgb_43fea.ipynb                 RF_model/
3_xgb.ipynb                       ruta_for_cliente_producto.csv
3_xgb_prediction.ipynb            stack_sub/
3_xgb_test.ipynb                  submission_10_new.csv
4_keras_nn.ipynb                  submission_11_new.csv
5_random_forest.ipynb             submission_44fea.csv
6_stack_model.ipynb               submission_all_train_10.csv
7_SGD_regressor.ipynb             submission_all_train_11.csv
8_svm_linearSVR.ipynb             submission_all_train_12.csv
9_private_board.ipynb             submission_all_train_13.csv
agencia_for_cliente_producto.csv  submission_all_train_14.csv
bst_1000_eta001_2.model           submission_all_train_2.csv
bst_1000_eta001.model             submission_all_train_3.csv
bst_1000_eta002.model             submission_all_train_4.csv
bst_1000.model                    submission_all_train_5.csv
bst_1200_eta0015.model            submission_all_train_6.csv
bst_1200_eta005.model             submission_all_train_7.csv
bst_1400_eta0015.model            submission_all_train_8.csv
bst_1400_eta002.model             submission_all_train_9.csv
bst_1800_eta0015.model            submission_all_train.csv
bst_200_eta005.model              submission_nn.csv
bst_400_eta002.model              submission_nn_xgb
bst_400_eta005.model              train_pivot_45678_to_9_whole_zero.csv
bst_551_eta02.model               train_pivot_56789_to_10_44fea.pickle
bst_600_eta001.model              train_pivot_56789_to_10_44fea_zero.pickle
bst_600_eta002.model              train_pivot_56789_to_10_new.pickle
bst_600_eta005.model              train_pivot_6789_to_11_new.pickle
bst_800_eta002.model              train_pivot_xgb_time1_44fea.csv
bst_use_all_train.model           train_pivot_xgb_time1_44fea_zero.csv
canal_for_cliente_producto.csv    train_pivot_xgb_time1.pickle
old_submission/                   train_pivot_xgb_time2_38fea.csv
origin/



In [3]:

    
dtypes = {'Semana' : 'int32',
          'Agencia_ID' :'int32',
          'Canal_ID' : 'int32',
          'Ruta_SAK' : 'int32',
          'Cliente-ID' : 'int32',
          'Producto_ID':'int32',
          'Venta_hoy':'float32',
          'Venta_uni_hoy': 'int32',
          'Dev_uni_proxima':'int32',
          'Dev_proxima':'float32',
          'Demanda_uni_equil':'int32'}



In [4]:

    
train_dataset = pd.read_csv('origin/train.csv',
                            usecols =['Semana','Agencia_ID','Canal_ID','Ruta_SAK','Cliente_ID','Producto_ID','Demanda_uni_equil'],
                            dtype  = dtypes)
train_dataset['log_demand'] = train_dataset['Demanda_uni_equil'].apply(np.log1p)
train_dataset.drop(['Demanda_uni_equil'],axis = 1,inplace = True)
train_dataset.head()









    Out[4]:






  
    
      
      Semana
      Agencia_ID
      Canal_ID
      Ruta_SAK
      Cliente_ID
      Producto_ID
      log_demand
    
  
  
    
      0
      3
      1110
      7
      3301
      15766
      1212
      1.386294
    
    
      1
      3
      1110
      7
      3301
      15766
      1216
      1.609438
    
    
      2
      3
      1110
      7
      3301
      15766
      1238
      1.609438
    
    
      3
      3
      1110
      7
      3301
      15766
      1240
      1.609438
    
    
      4
      3
      1110
      7
      3301
      15766
      1242
      1.386294



In [8]:

    
pivot_train = pd.read_pickle('pivot_train_with_nan.pickle')
pivot_train = pivot_train.rename(columns={3: 'Sem3', 4: 'Sem4',5: 'Sem5', 6: 'Sem6',7: 'Sem7', 8: 'Sem8',9: 'Sem9'})
pivot_train.head()









    Out[8]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26
      1182
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2061
      2
      7212
    
    
      1
      26
      4767
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2061
      2
      7212
    
    
      2
      26
      30235
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2655
      2
      4189
    
    
      3
      26
      30314
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2655
      2
      4189
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      2
      7212



In [9]:

    
pivot_train_zero = pivot_train.fillna(0)
pivot_train_zero.head()









    Out[9]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26
      1182
      3.688879
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      2061
      2
      7212
    
    
      1
      26
      4767
      3.761200
      0.000000
      0.000000
      0.000000
      3.761200
      3.761200
      0.000000
      2061
      2
      7212
    
    
      2
      26
      30235
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      4.574711
      2655
      2
      4189
    
    
      3
      26
      30314
      0.000000
      0.000000
      0.000000
      0.000000
      3.891820
      0.000000
      0.000000
      2655
      2
      4189
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      2
      7212



In [5]:

    
pivot_test = pd.read_pickle('pivot_test.pickle')
pivot_test.rename(columns = {'Semana':'sem10_sem11'},inplace = True)
pivot_test.shape









    Out[5]:





(6999251, 14)



In [6]:

    
pivot_test['Cliente_ID'] = pivot_test['Cliente_ID'].astype(np.int32)
pivot_test['Producto_ID'] = pivot_test['Producto_ID'].astype(np.int32)



In [7]:

    
pivot_test.head()









    Out[7]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      id
      sem10_sem11
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26
      31518
      NaN
      2.397895
      NaN
      NaN
      NaN
      NaN
      NaN
      1569352
      10
      2655
      2
      4189
    
    
      1
      26
      34206
      4.795791
      4.406719
      3.76120
      4.248495
      4.574711
      4.499810
      4.682131
      1547831
      11
      2061
      2
      7212
    
    
      2
      26
      34210
      2.890372
      3.761200
      3.78419
      3.496508
      3.433987
      3.688879
      3.688879
      6667200
      10
      2061
      2
      7212
    
    
      3
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      1592616
      10
      2061
      2
      7212
    
    
      4
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      6825659
      11
      2061
      2
      7212



In [27]:

    
pivot_test.columns.values









    Out[27]:





array(['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9', 'id', 'sem10_sem11', 'Agencia_ID', 'Canal_ID',
       'Ruta_SAK'], dtype=object)

make the train_pivot, duplicate exist when index = ['Cliente','Producto']

for each cliente & producto, first find its most common Agencia_ID, Canal_ID, Ruta_SAK



In [29]:

    
agencia_for_cliente_producto = train_dataset[['Cliente_ID','Producto_ID'
                                              ,'Agencia_ID']].groupby(['Cliente_ID',
                                                                        'Producto_ID']).agg(lambda x:x.value_counts().index[0]).reset_index()
canal_for_cliente_producto = train_dataset[['Cliente_ID',
                                            'Producto_ID','Canal_ID']].groupby(['Cliente_ID',
                                                                        'Producto_ID']).agg(lambda x:x.value_counts().index[0]).reset_index()
ruta_for_cliente_producto = train_dataset[['Cliente_ID',
                                           'Producto_ID','Ruta_SAK']].groupby(['Cliente_ID',
                                                                        'Producto_ID']).agg(lambda x:x.value_counts().index[0]).reset_index()



In [22]:

    
gc.collect()









    Out[22]:





775



In [37]:

    
agencia_for_cliente_producto.to_pickle('agencia_for_cliente_producto.csv')
canal_for_cliente_producto.to_pickle('canal_for_cliente_producto.csv')
ruta_for_cliente_producto.to_pickle('ruta_for_cliente_producto.csv')



In [8]:

    
agencia_for_cliente_producto = pd.read_pickle('agencia_for_cliente_producto.csv')
canal_for_cliente_producto = pd.read_pickle('canal_for_cliente_producto.csv')
ruta_for_cliente_producto = pd.read_pickle('ruta_for_cliente_producto.csv')



In [6]:

    
# train_dataset['log_demand'] = train_dataset['Demanda_uni_equil'].apply(np.log1p)
pivot_train = pd.pivot_table(data= train_dataset[['Cliente_ID','Producto_ID','log_demand','Semana']],
                                        values='log_demand', index=['Cliente_ID','Producto_ID'],
                                        columns=['Semana'], aggfunc=np.mean,fill_value = 0).reset_index()



In [5]:

    
pivot_train.head()









    Out[5]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      3
      4
      5
      6
      7
      8
      9
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26
      1182
      3.688879
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      2061
      2
      7212
    
    
      1
      26
      4767
      3.761200
      0.000000
      0.000000
      0.000000
      3.761200
      3.761200
      0.000000
      2061
      2
      7212
    
    
      2
      26
      30235
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      4.574711
      2655
      2
      4189
    
    
      3
      26
      30314
      0.000000
      0.000000
      0.000000
      0.000000
      3.891820
      0.000000
      0.000000
      2655
      2
      4189
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      2
      7212



In [9]:

    
pivot_train = pd.merge(left = pivot_train, right = agencia_for_cliente_producto, how = 'inner', on = ['Cliente_ID','Producto_ID'])
pivot_train = pd.merge(left = pivot_train, right = canal_for_cliente_producto, how = 'inner', on = ['Cliente_ID','Producto_ID'])
pivot_train = pd.merge(left = pivot_train, right = ruta_for_cliente_producto, how = 'inner', on = ['Cliente_ID','Producto_ID'])



In [11]:

    
pivot_train.to_pickle('pivot_train_with_zero.pickle')



In [4]:

    
pivot_train = pd.read_pickle('pivot_train_with_zero.pickle')



In [15]:

    
pivot_train.to_pickle('pivot_train_with_nan.pickle')



In [3]:

    
pivot_train = pd.read_pickle('pivot_train_with_nan.pickle')



In [15]:

    
pivot_train = pivot_train.rename(columns={3: 'Sem3', 4: 'Sem4',5: 'Sem5', 6: 'Sem6',7: 'Sem7', 8: 'Sem8',9: 'Sem9'})



In [16]:

    
pivot_train.head()









    Out[16]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26
      1182
      3.688879
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      2061
      2
      7212
    
    
      1
      26
      4767
      3.761200
      0.000000
      0.000000
      0.000000
      3.761200
      3.761200
      0.000000
      2061
      2
      7212
    
    
      2
      26
      30235
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      4.574711
      2655
      2
      4189
    
    
      3
      26
      30314
      0.000000
      0.000000
      0.000000
      0.000000
      3.891820
      0.000000
      0.000000
      2655
      2
      4189
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      2
      7212



In [8]:

    
pivot_train.columns.values









    Out[8]:





array(['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK'], dtype=object)

make pivot table of test



In [7]:

    
test_dataset = pd.read_csv('origin/test.csv')
test_dataset.head()









    Out[7]:






  
    
      
      id
      Semana
      Agencia_ID
      Canal_ID
      Ruta_SAK
      Cliente_ID
      Producto_ID
    
  
  
    
      0
      0
      11
      4037
      1
      2209
      4639078
      35305
    
    
      1
      1
      11
      2237
      1
      1226
      4705135
      1238
    
    
      2
      2
      10
      2045
      1
      2831
      4549769
      32940
    
    
      3
      3
      11
      1227
      1
      4448
      4717855
      43066
    
    
      4
      4
      11
      1219
      1
      1130
      966351
      1277



In [7]:

    
test_dataset[test_dataset['Semana'] == 10].shape









    Out[7]:





(3538385, 7)



In [8]:

    
test_dataset[test_dataset['Semana'] == 11].shape









    Out[8]:





(3460866, 7)



In [74]:

    
pivot_test = pd.merge(left=pivot_train, right = test_dataset[['id','Cliente_ID','Producto_ID','Semana']],
                      on =['Cliente_ID','Producto_ID'],how = 'inner' )
pivot_test.head()









    Out[74]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      Canal_ID
      Ruta_SAK
      id
      Semana
    
  
  
    
      0
      26
      31518
      NaN
      2.397895
      NaN
      NaN
      NaN
      NaN
      NaN
      2655
      2
      4189
      1569352
      10
    
    
      1
      26
      34206
      4.795791
      4.406719
      3.76120
      4.248495
      4.574711
      4.499810
      4.682131
      2061
      2
      7212
      1547831
      11
    
    
      2
      26
      34210
      2.890372
      3.761200
      3.78419
      3.496508
      3.433987
      3.688879
      3.688879
      2061
      2
      7212
      6667200
      10
    
    
      3
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      2061
      2
      7212
      1592616
      10
    
    
      4
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      2061
      2
      7212
      6825659
      11



In [8]:

    
pivot_test_new = pd.merge(pivot_train[['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9']],right = test_dataset, on = ['Cliente_ID','Producto_ID'],how = 'right')



In [9]:

    
pivot_test_new.head()









    Out[9]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      id
      Semana
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26.0
      31518.0
      NaN
      2.397895
      NaN
      NaN
      NaN
      NaN
      NaN
      1569352
      10
      2655
      2
      4189
    
    
      1
      26.0
      34206.0
      4.795791
      4.406719
      3.76120
      4.248495
      4.574711
      4.499810
      4.682131
      1547831
      11
      2061
      2
      7212
    
    
      2
      26.0
      34210.0
      2.890372
      3.761200
      3.78419
      3.496508
      3.433987
      3.688879
      3.688879
      6667200
      10
      2061
      2
      7212
    
    
      3
      26.0
      34785.0
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      1592616
      10
      2061
      2
      7212
    
    
      4
      26.0
      34785.0
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      6825659
      11
      2061
      2
      7212



In [10]:

    
pivot_test_new.to_pickle('pivot_test.pickle')



In [75]:

    
pivot_test.to_pickle('pivot_test.pickle')



In [4]:

    
pivot_test = pd.read_pickle('pivot_test.pickle')
pivot_test.head()









    Out[4]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      Canal_ID
      Ruta_SAK
      id
      Semana
    
  
  
    
      0
      26
      31518
      NaN
      2.397895
      NaN
      NaN
      NaN
      NaN
      NaN
      2655
      2
      4189
      1569352
      10
    
    
      1
      26
      34206
      4.795791
      4.406719
      3.76120
      4.248495
      4.574711
      4.499810
      4.682131
      2061
      2
      7212
      1547831
      11
    
    
      2
      26
      34210
      2.890372
      3.761200
      3.78419
      3.496508
      3.433987
      3.688879
      3.688879
      2061
      2
      7212
      6667200
      10
    
    
      3
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      2061
      2
      7212
      1592616
      10
    
    
      4
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      2061
      2
      7212
      6825659
      11

groupby use Agencia_ID, Ruta_SAK, Cliente_ID, Producto_ID



In [52]:

    
train_dataset.head()









    Out[52]:






  
    
      
      Semana
      Agencia_ID
      Canal_ID
      Ruta_SAK
      Cliente_ID
      Producto_ID
      Demanda_uni_equil
      log_demand
    
  
  
    
      0
      3
      1110
      7
      3301
      15766
      1212
      3
      1.386294
    
    
      1
      3
      1110
      7
      3301
      15766
      1216
      4
      1.609438
    
    
      2
      3
      1110
      7
      3301
      15766
      1238
      4
      1.609438
    
    
      3
      3
      1110
      7
      3301
      15766
      1240
      4
      1.609438
    
    
      4
      3
      1110
      7
      3301
      15766
      1242
      3
      1.386294



In [70]:

    
import itertools
col_list = ['Agencia_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID']
all_combine = itertools.combinations(col_list,2)



In [71]:

    
list_2element_combine = [list(tuple) for tuple in all_combine]



In [75]:

    
col_1elm_2elm = col_list + list_2element_combine
col_1elm_2elm









    Out[75]:





['Agencia_ID',
 'Ruta_SAK',
 'Cliente_ID',
 'Producto_ID',
 ['Agencia_ID', 'Ruta_SAK'],
 ['Agencia_ID', 'Cliente_ID'],
 ['Agencia_ID', 'Producto_ID'],
 ['Ruta_SAK', 'Cliente_ID'],
 ['Ruta_SAK', 'Producto_ID'],
 ['Cliente_ID', 'Producto_ID']]



In [79]:

    
train_dataset_test = train_dataset[train_dataset['Semana'] < 8].copy()

if predict week 8, use data from 3,4,5,6,7

if predict week 9, use data from 3,4,5,6,7



In [11]:

    
def categorical_useful(train_dataset,pivot_train):
#     if is_train:
#         train_dataset_test = train_dataset[train_dataset['Semana'] < 8].copy()
#     elif is_train == False:
    train_dataset_test = train_dataset.copy()
        
    log_demand_by_agen = train_dataset_test[['Agencia_ID','log_demand']].groupby('Agencia_ID').mean().reset_index()
    log_demand_by_ruta = train_dataset_test[['Ruta_SAK','log_demand']].groupby('Ruta_SAK').mean().reset_index()
    log_demand_by_cliente = train_dataset_test[['Cliente_ID','log_demand']].groupby('Cliente_ID').mean().reset_index()
    log_demand_by_producto = train_dataset_test[['Producto_ID','log_demand']].groupby('Producto_ID').mean().reset_index()
    
    log_demand_by_agen_ruta = train_dataset_test[['Agencia_ID', 'Ruta_SAK',
                                                  'log_demand']].groupby(['Agencia_ID', 'Ruta_SAK']).mean().reset_index()
    log_demand_by_agen_cliente = train_dataset_test[['Agencia_ID', 'Cliente_ID',
                                                     'log_demand']].groupby(['Agencia_ID', 'Cliente_ID']).mean().reset_index()
    log_demand_by_agen_producto = train_dataset_test[['Agencia_ID', 'Producto_ID',
                                                      'log_demand']].groupby(['Agencia_ID', 'Producto_ID']).mean().reset_index()
    log_demand_by_ruta_cliente = train_dataset_test[['Ruta_SAK', 'Cliente_ID',
                                                     'log_demand']].groupby(['Ruta_SAK', 'Cliente_ID']).mean().reset_index()
    log_demand_by_ruta_producto = train_dataset_test[['Ruta_SAK', 'Producto_ID',
                                                      'log_demand']].groupby(['Ruta_SAK', 'Producto_ID']).mean().reset_index()
    log_demand_by_cliente_producto = train_dataset_test[['Cliente_ID', 'Producto_ID',
                                                     'log_demand']].groupby(['Cliente_ID', 'Producto_ID']).mean().reset_index()
    
    log_demand_by_cliente_producto_agen = train_dataset_test[[
            'Cliente_ID','Producto_ID','Agencia_ID','log_demand']].groupby(['Cliente_ID',
                                                                            'Agencia_ID','Producto_ID']).mean().reset_index()
    
    log_sum_by_cliente = train_dataset_test[['Cliente_ID','log_demand']].groupby('Cliente_ID').sum().reset_index()
    
    ruta_freq_semana = train_dataset[['Semana','Ruta_SAK']].groupby(['Ruta_SAK']).count().reset_index()
    clien_freq_semana = train_dataset[['Semana','Cliente_ID']].groupby(['Cliente_ID']).count().reset_index()
    agen_freq_semana = train_dataset[['Semana','Agencia_ID']].groupby(['Agencia_ID']).count().reset_index()
    prod_freq_semana = train_dataset[['Semana','Producto_ID']].groupby(['Producto_ID']).count().reset_index()
    
    
    pivot_train = pd.merge(left = pivot_train,right = ruta_freq_semana,
                           how = 'left', on = ['Ruta_SAK']).rename(columns={'Semana': 'ruta_freq'})
    pivot_train = pd.merge(left = pivot_train,right = clien_freq_semana,
                           how = 'left', on = ['Cliente_ID']).rename(columns={'Semana': 'clien_freq'})
    pivot_train = pd.merge(left = pivot_train,right = agen_freq_semana,
                           how = 'left', on = ['Agencia_ID']).rename(columns={'Semana': 'agen_freq'})
    pivot_train = pd.merge(left = pivot_train,right = prod_freq_semana,
                           how = 'left', on = ['Producto_ID']).rename(columns={'Semana': 'prod_freq'})
    
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_agen,
                           how = 'left', on = ['Agencia_ID']).rename(columns={'log_demand': 'agen_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_ruta,
                           how = 'left', on = ['Ruta_SAK']).rename(columns={'log_demand': 'ruta_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_cliente,
                           how = 'left', on = ['Cliente_ID']).rename(columns={'log_demand': 'cliente_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_producto,
                           how = 'left', on = ['Producto_ID']).rename(columns={'log_demand': 'producto_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_agen_ruta,
                           how = 'left', on = ['Agencia_ID', 'Ruta_SAK']).rename(columns={'log_demand': 'agen_ruta_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_agen_cliente,
                           how = 'left', on = ['Agencia_ID', 'Cliente_ID']).rename(columns={'log_demand': 'agen_cliente_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_agen_producto,
                           how = 'left', on = ['Agencia_ID', 'Producto_ID']).rename(columns={'log_demand': 'agen_producto_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_ruta_cliente,
                           how = 'left', on = ['Ruta_SAK', 'Cliente_ID']).rename(columns={'log_demand': 'ruta_cliente_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_ruta_producto,
                           how = 'left', on = ['Ruta_SAK', 'Producto_ID']).rename(columns={'log_demand': 'ruta_producto_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_cliente_producto,
                           how = 'left', on = ['Cliente_ID', 'Producto_ID']).rename(columns={'log_demand': 'cliente_producto_for_log_de'})
    
    pivot_train = pd.merge(left = pivot_train,
                           right = log_sum_by_cliente,
                           how = 'left', on = ['Cliente_ID']).rename(columns={'log_demand': 'cliente_for_log_sum'})
    
    pivot_train = pd.merge(left = pivot_train,
                       right = log_demand_by_cliente_producto_agen,
                       how = 'left', on = ['Cliente_ID', 'Producto_ID',
                                           'Agencia_ID']).rename(columns={'log_demand': 'cliente_producto_agen_for_log_sum'})
    
    
    pivot_train['corr'] = pivot_train['producto_for_log_de'] * pivot_train['cliente_for_log_de'] / train_dataset_test['log_demand'].median()

    return pivot_train



In [12]:

    
def define_time_features(df, to_predict = 't_plus_1' , t_0 = 8):
    if(to_predict == 't_plus_1' ):
        df['t_min_1'] = df['Sem'+str(t_0-1)]
    if(to_predict == 't_plus_2' ):
        df['t_min_6'] = df['Sem'+str(t_0-6)]
      
    df['t_min_2'] = df['Sem'+str(t_0-2)]
    df['t_min_3'] = df['Sem'+str(t_0-3)]
    df['t_min_4'] = df['Sem'+str(t_0-4)]
    df['t_min_5'] = df['Sem'+str(t_0-5)]
    
    
    if(to_predict == 't_plus_1' ):
        df['t1_min_t2'] =  df['t_min_1'] - df['t_min_2']
        df['t1_min_t3'] =  df['t_min_1'] - df['t_min_3']
        df['t1_min_t4'] =  df['t_min_1'] - df['t_min_4']
        df['t1_min_t5'] =  df['t_min_1'] - df['t_min_5']
    
    if(to_predict == 't_plus_2' ):
        df['t2_min_t6'] =  df['t_min_2'] - df['t_min_6']
        df['t3_min_t6'] =  df['t_min_3'] - df['t_min_6']
        df['t4_min_t6'] =  df['t_min_4'] - df['t_min_6']
        df['t5_min_t6'] =  df['t_min_5'] - df['t_min_6']

    df['t2_min_t3'] =  df['t_min_2'] - df['t_min_3']
    df['t2_min_t4'] =  df['t_min_2'] - df['t_min_4']
    df['t2_min_t5'] =  df['t_min_2'] - df['t_min_5']

    df['t3_min_t4'] =  df['t_min_3'] - df['t_min_4']
    df['t3_min_t5'] =  df['t_min_3'] - df['t_min_5']

    df['t4_min_t5'] =  df['t_min_4'] - df['t_min_5']
        
    return df



In [13]:

    
def lin_regr(row, to_predict, t_0, semanas_numbers):
    row = row.copy()
    row.index = semanas_numbers
    row = row.dropna()
    if(len(row>2)):
        X = np.ones(shape=(len(row), 2))
        X[:,1] = row.index
        y = row.values
        regr = linear_model.LinearRegression()
        regr.fit(X, y)
        if(to_predict == 't_plus_1'):
            return regr.predict([[1,t_0+1]])[0]
        elif(to_predict == 't_plus_2'):
            return regr.predict([[1,t_0+2]])[0]
    else:
        return None



In [14]:

    
def lin_regr_features(pivot_df,to_predict, semanas_numbers,t_0):
    pivot_df = pivot_df.copy()
    semanas_names = ['Sem%i' %i for i in semanas_numbers]
    columns = ['Sem%i' %i for i in semanas_numbers]
    columns.append('Producto_ID')
    pivot_grouped = pivot_df[columns].groupby('Producto_ID').aggregate('mean')
    pivot_grouped['LR_prod'] = np.zeros(len(pivot_grouped))
    pivot_grouped['LR_prod'] = pivot_grouped[semanas_names].apply(lin_regr, axis = 1,
                                                                  to_predict = to_predict,
                                                                  t_0 = t_0, semanas_numbers = semanas_numbers )
    pivot_df = pd.merge(pivot_df, pivot_grouped[['LR_prod']], how='left', left_on = 'Producto_ID', right_index=True)
    pivot_df['LR_prod_corr'] = pivot_df['LR_prod'] * pivot_df['cliente_for_log_sum'] / 100
    return pivot_df



In [15]:

    
cliente_tabla = pd.read_csv('origin/cliente_tabla.csv')
town_state = pd.read_csv('origin/town_state.csv')



In [16]:

    
town_state['town_id'] = town_state['Town'].str.split()
town_state['town_id'] = town_state['Town'].str.split(expand = True)



In [17]:

    
def add_pro_info(dataset):
    train_basic_feature = dataset[['Cliente_ID','Producto_ID','Agencia_ID']].copy()
    train_basic_feature.drop_duplicates(inplace = True)

    cliente_per_town = pd.merge(train_basic_feature,cliente_tabla,on = 'Cliente_ID',how= 'inner' )
#     print cliente_per_town.shape
    cliente_per_town = pd.merge(cliente_per_town,town_state[['Agencia_ID','town_id']],on = 'Agencia_ID',how= 'inner' )

#     print cliente_per_town.shape

    cliente_per_town_count = cliente_per_town[['NombreCliente','town_id']].groupby('town_id').count().reset_index()
#     print cliente_per_town_count.head()

    cliente_per_town_count_final = pd.merge(cliente_per_town[['Cliente_ID','Producto_ID','town_id','Agencia_ID']],
                                            cliente_per_town_count,on = 'town_id',how = 'inner')

#     print cliente_per_town_count_final.head()
    cliente_per_town_count_final.drop_duplicates(inplace = True)
    
    dataset_final = pd.merge(dataset,cliente_per_town_count_final[['Cliente_ID','Producto_ID','NombreCliente','Agencia_ID']],
                             on = ['Cliente_ID','Producto_ID','Agencia_ID'],how = 'left')
    return dataset_final



In [18]:

    
pre_product = pd.read_csv('preprocessed_products.csv',index_col = 0)

pre_product['weight_per_piece'] = pd.to_numeric(pre_product['weight_per_piece'], errors='coerce')
pre_product['weight'] = pd.to_numeric(pre_product['weight'], errors='coerce')
pre_product['pieces'] = pd.to_numeric(pre_product['pieces'], errors='coerce')



In [19]:

    
def add_product(dataset):

    dataset = pd.merge(dataset,pre_product[['ID','weight','weight_per_piece','pieces']],
                                                             left_on = 'Producto_ID',right_on = 'ID',how = 'left')
    return dataset

data for predict week [34567----9], time plus 2 week



In [19]:

    
train_34567 = train_dataset.loc[train_dataset['Semana'].isin([3,4,5,6,7]), :].copy()
train_pivot_34567_to_9 = pivot_train_zero.loc[(pivot_train['Sem9'].notnull()),:].copy()

train_pivot_34567_to_9 = categorical_useful(train_34567,train_pivot_34567_to_9)

del train_34567
gc.collect()

train_pivot_34567_to_9 = define_time_features(train_pivot_34567_to_9, to_predict = 't_plus_2' , t_0 = 9)
train_pivot_34567_to_9 = lin_regr_features(train_pivot_34567_to_9,to_predict ='t_plus_2',
                                           semanas_numbers = [3,4,5,6,7],t_0 = 9)

train_pivot_34567_to_9['target'] = train_pivot_34567_to_9['Sem9']
train_pivot_34567_to_9.drop(['Sem8','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_34567_to_9[['Sem3','Sem4','Sem5','Sem6','Sem7']].cumsum(axis = 1)

train_pivot_34567_to_9.drop(['Sem3','Sem4','Sem5','Sem6','Sem7'],axis =1,inplace = True)

train_pivot_34567_to_9 = pd.concat([train_pivot_34567_to_9,train_pivot_cum_sum],axis =1)

train_pivot_34567_to_9 = train_pivot_34567_to_9.rename(columns={'Sem3': 't_m_6_cum',
                                          'Sem4': 't_m_5_cum','Sem5': 't_m_4_cum',
                                                                'Sem6': 't_m_3_cum','Sem7': 't_m_2_cum'})
# add geo_info
train_pivot_34567_to_9 = add_pro_info(train_pivot_34567_to_9)

#add product info
train_pivot_34567_to_9 = add_product(train_pivot_34567_to_9)
train_pivot_34567_to_9.drop(['ID'],axis = 1,inplace = True)

gc.collect()

train_pivot_34567_to_9.head()









    Out[19]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Agencia_ID
      Canal_ID
      Ruta_SAK
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_6
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t2_min_t6
      t3_min_t6
      t4_min_t6
      t5_min_t6
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      target
      t_m_6_cum
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      26
      30235
      2655
      2
      4189
      11.0
      160.0
      4017
      2159.0
      3.936934
      3.095841
      2.897803
      3.106304
      3.095841
      3.015667
      3.569138
      3.095841
      NaN
      NaN
      463.648478
      NaN
      6.493178
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.730772
      3.388213
      4.574711
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      233
      600.0
      75.00
      8.0
    
    
      1
      26
      31393
      2061
      2
      7212
      548.0
      160.0
      12574
      15736.0
      3.489749
      3.208232
      2.897803
      1.563950
      2.877003
      2.877003
      3.333444
      2.877003
      3.013416
      2.873470
      463.648478
      2.873470
      3.269161
      3.044522
      2.944439
      2.772589
      2.772589
      2.833213
      -0.100083
      -0.271934
      -0.271934
      -0.211309
      0.171850
      0.171850
      0.111226
      0.000000
      -0.060625
      -0.060625
      1.208998
      5.605499
      2.639057
      3.044522
      5.877736
      8.650325
      11.422913
      14.367352
      18989
      640.0
      NaN
      NaN
    
    
      2
      26
      32962
      2061
      2
      7212
      548.0
      160.0
      12574
      7432.0
      3.489749
      3.208232
      2.897803
      2.637196
      2.877003
      2.877003
      2.350336
      2.877003
      1.893925
      1.386294
      463.648478
      1.386294
      5.512592
      1.386294
      0.000000
      0.000000
      0.000000
      0.000000
      -1.386294
      -1.386294
      -1.386294
      -1.386294
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.917894
      8.892288
      2.397895
      1.386294
      1.386294
      1.386294
      1.386294
      1.386294
      18989
      180.0
      30.00
      6.0
    
    
      3
      26
      34204
      2061
      2
      7212
      548.0
      160.0
      12574
      17261.0
      3.489749
      3.208232
      2.897803
      2.674463
      2.877003
      2.877003
      3.848593
      2.877003
      3.819687
      3.638221
      463.648478
      3.638221
      5.590491
      3.784190
      3.610918
      3.555348
      3.806662
      3.433987
      -0.173272
      -0.228842
      0.022473
      -0.350202
      0.055570
      -0.195745
      0.176931
      -0.251314
      0.121361
      0.372675
      2.474379
      11.472421
      3.784190
      3.784190
      7.218177
      11.024839
      14.580187
      18.191105
      18989
      450.0
      56.25
      8.0
    
    
      4
      26
      34206
      2061
      2
      7212
      548.0
      160.0
      12574
      87803.0
      3.489749
      3.208232
      2.897803
      1.730373
      2.877003
      2.877003
      4.836591
      2.877003
      4.628871
      4.357383
      463.648478
      4.357383
      3.617039
      4.795791
      4.574711
      4.248495
      3.761200
      4.406719
      -0.221080
      -0.547295
      -1.034590
      -0.389071
      0.326216
      0.813511
      0.167992
      0.487295
      -0.158224
      -0.645519
      0.383450
      1.777862
      4.682131
      4.795791
      9.202510
      12.963710
      17.212205
      21.786916
      18989
      340.0
      42.50
      8.0



In [40]:

    
train_pivot_34567_to_9.columns.values









    Out[40]:





array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_6', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't2_min_t6', 't3_min_t6',
       't4_min_t6', 't5_min_t6', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_6_cum', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)



In [23]:

    
len(train_pivot_34567_to_9.columns.values)









    Out[23]:





49



In [21]:

    
train_pivot_34567_to_9.to_csv('train_pivot_34567_to_9.csv')



In [22]:

    
train_pivot_34567_to_9 = pd.read_csv('train_pivot_34567_to_9.csv',index_col = 0)

test_for private data, week 11



In [28]:

    
pivot_test.head()









    Out[28]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      id
      sem10_sem11
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26
      31518
      NaN
      2.397895
      NaN
      NaN
      NaN
      NaN
      NaN
      1569352
      10
      2655
      2
      4189
    
    
      1
      26
      34206
      4.795791
      4.406719
      3.76120
      4.248495
      4.574711
      4.499810
      4.682131
      1547831
      11
      2061
      2
      7212
    
    
      2
      26
      34210
      2.890372
      3.761200
      3.78419
      3.496508
      3.433987
      3.688879
      3.688879
      6667200
      10
      2061
      2
      7212
    
    
      3
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      1592616
      10
      2061
      2
      7212
    
    
      4
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      6825659
      11
      2061
      2
      7212



In [29]:

    
pivot_test_week11 = pivot_test.loc[pivot_test['sem10_sem11'] == 11]
pivot_test_week11.reset_index(drop=True,inplace = True)
pivot_test_week11 = pivot_test_week11.fillna(0)
pivot_test_week11.head()









    Out[29]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      id
      sem10_sem11
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26
      34206
      4.795791
      4.406719
      3.761200
      4.248495
      4.574711
      4.499810
      4.682131
      1547831
      11
      2061
      2
      7212
    
    
      1
      26
      34785
      3.091042
      0.000000
      0.000000
      2.833213
      3.332205
      2.564949
      3.218876
      6825659
      11
      2061
      2
      7212
    
    
      2
      26
      35148
      2.772589
      2.772589
      1.791759
      2.772589
      2.397895
      3.044522
      2.995732
      5853787
      11
      2061
      2
      7212
    
    
      3
      26
      43335
      1.945910
      1.386294
      0.000000
      0.000000
      0.000000
      0.000000
      1.386294
      2316053
      11
      2061
      2
      7212
    
    
      4
      26
      43342
      1.386294
      2.197225
      1.098612
      0.000000
      0.000000
      1.791759
      3.044522
      900676
      11
      2061
      2
      7212



In [30]:

    
pivot_test_week11.shape









    Out[30]:





(3460866, 14)



In [31]:

    
train_56789 = train_dataset.loc[train_dataset['Semana'].isin([5,6,7,8,9]), :].copy()
train_pivot_56789_to_11 = pivot_test_week11.copy()

train_pivot_56789_to_11 = categorical_useful(train_56789,train_pivot_56789_to_11)

del train_56789
gc.collect()

train_pivot_56789_to_11 = define_time_features(train_pivot_56789_to_11, to_predict = 't_plus_2' , t_0 = 11)

train_pivot_56789_to_11 = lin_regr_features(train_pivot_56789_to_11,to_predict ='t_plus_2' , 
                                            semanas_numbers = [5,6,7,8,9],t_0 = 9)

train_pivot_56789_to_11.drop(['Sem3','Sem4'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_56789_to_11[['Sem5','Sem6','Sem7','Sem8','Sem9']].cumsum(axis = 1)

train_pivot_56789_to_11.drop(['Sem5','Sem6','Sem7','Sem8','Sem9'],axis =1,inplace = True)

train_pivot_56789_to_11 = pd.concat([train_pivot_56789_to_11,train_pivot_cum_sum],axis =1)


train_pivot_56789_to_11 = train_pivot_56789_to_11.rename(columns={'Sem5': 't_m_6_cum',
                                          'Sem6': 't_m_5_cum','Sem7': 't_m_4_cum',
                                                                  'Sem8': 't_m_3_cum','Sem9': 't_m_2_cum'})
# add product_info
train_pivot_56789_to_11 = add_pro_info(train_pivot_56789_to_11)

#
train_pivot_56789_to_11 = add_product(train_pivot_56789_to_11)

train_pivot_56789_to_11.drop(['ID'],axis =1,inplace = True)

for col in train_pivot_56789_to_11.columns.values:
    train_pivot_56789_to_11[col] = train_pivot_56789_to_11[col].astype(np.float32)

train_pivot_56789_to_11.head()









    Out[31]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      id
      sem10_sem11
      Agencia_ID
      Canal_ID
      Ruta_SAK
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_6
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t2_min_t6
      t3_min_t6
      t4_min_t6
      t5_min_t6
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      t_m_6_cum
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      26.0
      34206.0
      1547831.0
      11.0
      2061.0
      2.0
      7212.0
      713.0
      166.0
      12208.0
      91578.0
      3.523074
      3.222417
      2.890955
      1.724493
      2.835826
      2.835826
      4.809507
      2.835826
      4.642291
      4.353270
      479.898621
      4.353270
      3.596229
      3.761200
      4.682131
      4.499810
      4.574711
      4.248495
      0.920931
      0.738610
      0.813511
      0.487295
      0.182322
      0.107420
      0.433636
      -0.074901
      0.251314
      0.326216
      1.536319
      7.372772
      3.761200
      8.009695
      12.584406
      17.084215
      21.766348
      5954.0
      340.0
      42.500
      8.0
    
    
      1
      26.0
      34785.0
      6825659.0
      11.0
      2061.0
      2.0
      7212.0
      713.0
      166.0
      12208.0
      10780.0
      3.523074
      3.222417
      2.890955
      3.987424
      2.835826
      2.835826
      3.836502
      2.835826
      3.786542
      2.987311
      479.898621
      2.987311
      8.315309
      0.000000
      3.218876
      2.564949
      3.332205
      2.833213
      3.218876
      2.564949
      3.332205
      2.833213
      0.653926
      -0.113329
      0.385662
      -0.767255
      -0.268264
      0.498991
      3.918295
      18.803841
      0.000000
      2.833213
      6.165418
      8.730368
      11.949243
      5954.0
      480.0
      NaN
      NaN
    
    
      2
      26.0
      35148.0
      5853787.0
      11.0
      2061.0
      2.0
      7212.0
      713.0
      166.0
      12208.0
      11047.0
      3.523074
      3.222417
      2.890955
      2.694399
      2.835826
      2.835826
      3.035542
      2.835826
      2.942251
      2.600500
      479.898621
      2.600500
      5.618855
      1.791759
      2.995732
      3.044523
      2.397895
      2.772589
      1.203973
      1.252763
      0.606136
      0.980829
      -0.048790
      0.597837
      0.223144
      0.646627
      0.271934
      -0.374693
      2.439828
      11.708699
      1.791759
      4.564348
      6.962244
      10.006766
      13.002499
      5954.0
      567.0
      NaN
      NaN
    
    
      3
      26.0
      43335.0
      2316053.0
      11.0
      2061.0
      2.0
      7212.0
      713.0
      166.0
      12208.0
      4912.0
      3.523074
      3.222417
      2.890955
      2.392465
      2.835826
      2.835826
      2.128167
      2.835826
      1.912965
      0.693147
      479.898621
      0.693147
      4.989207
      0.000000
      1.386294
      0.000000
      0.000000
      0.000000
      1.386294
      0.000000
      0.000000
      0.000000
      1.386294
      1.386294
      1.386294
      0.000000
      0.000000
      0.000000
      2.195412
      10.535754
      0.000000
      0.000000
      0.000000
      0.000000
      1.386294
      5954.0
      435.0
      54.375
      8.0
    
    
      4
      26.0
      43342.0
      900676.0
      11.0
      2061.0
      2.0
      7212.0
      713.0
      166.0
      12208.0
      10614.0
      3.523074
      3.222417
      2.890955
      2.745177
      2.835826
      2.835826
      2.862706
      2.835826
      2.438047
      1.978298
      479.898621
      1.978298
      5.724748
      1.098612
      3.044523
      1.791759
      0.000000
      0.000000
      1.945910
      0.693147
      -1.098612
      -1.098612
      1.252763
      3.044523
      3.044523
      1.791759
      1.791759
      0.000000
      2.509016
      12.040730
      1.098612
      1.098612
      1.098612
      2.890372
      5.934894
      5954.0
      248.0
      62.000
      4.0



In [32]:

    
train_pivot_56789_to_11.columns.values









    Out[32]:





array(['Cliente_ID', 'Producto_ID', 'id', 'sem10_sem11', 'Agencia_ID',
       'Canal_ID', 'Ruta_SAK', 'ruta_freq', 'clien_freq', 'agen_freq',
       'prod_freq', 'agen_for_log_de', 'ruta_for_log_de',
       'cliente_for_log_de', 'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_6', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't2_min_t6', 't3_min_t6',
       't4_min_t6', 't5_min_t6', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       't_m_6_cum', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)



In [33]:

    
train_pivot_56789_to_11.shape









    Out[33]:





(3460866, 50)



In [25]:

    
new_feature = ['id', 'ruta_freq', 'clien_freq', 'agen_freq',
       'prod_freq', 'agen_for_log_de', 'ruta_for_log_de',
       'cliente_for_log_de', 'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_6', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't2_min_t6', 't3_min_t6',
       't4_min_t6', 't5_min_t6', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       't_m_6_cum', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       'NombreCliente', 'weight', 'weight_per_piece', 'pieces']



In [35]:

    
len(new_feature)









    Out[35]:





44



In [36]:

    
train_pivot_56789_to_11 = train_pivot_56789_to_11[new_feature]
train_pivot_56789_to_11.head()









    Out[36]:






  
    
      Semana
      id
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_6
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t2_min_t6
      t3_min_t6
      t4_min_t6
      t5_min_t6
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      t_m_6_cum
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      1547831.0
      713.0
      166.0
      12208.0
      91578.0
      3.523074
      3.222417
      2.890955
      1.724493
      2.835826
      2.835826
      4.809507
      2.835826
      4.642291
      4.353270
      479.898621
      4.353270
      3.596229
      3.761200
      4.682131
      4.499810
      4.574711
      4.248495
      0.920931
      0.738610
      0.813511
      0.487295
      0.182322
      0.107420
      0.433636
      -0.074901
      0.251314
      0.326216
      1.536319
      7.372772
      3.761200
      8.009695
      12.584406
      17.084215
      21.766348
      5954.0
      340.0
      42.500
      8.0
    
    
      1
      6825659.0
      713.0
      166.0
      12208.0
      10780.0
      3.523074
      3.222417
      2.890955
      3.987424
      2.835826
      2.835826
      3.836502
      2.835826
      3.786542
      2.987311
      479.898621
      2.987311
      8.315309
      0.000000
      3.218876
      2.564949
      3.332205
      2.833213
      3.218876
      2.564949
      3.332205
      2.833213
      0.653926
      -0.113329
      0.385662
      -0.767255
      -0.268264
      0.498991
      3.918295
      18.803841
      0.000000
      2.833213
      6.165418
      8.730368
      11.949243
      5954.0
      480.0
      NaN
      NaN
    
    
      2
      5853787.0
      713.0
      166.0
      12208.0
      11047.0
      3.523074
      3.222417
      2.890955
      2.694399
      2.835826
      2.835826
      3.035542
      2.835826
      2.942251
      2.600500
      479.898621
      2.600500
      5.618855
      1.791759
      2.995732
      3.044523
      2.397895
      2.772589
      1.203973
      1.252763
      0.606136
      0.980829
      -0.048790
      0.597837
      0.223144
      0.646627
      0.271934
      -0.374693
      2.439828
      11.708699
      1.791759
      4.564348
      6.962244
      10.006766
      13.002499
      5954.0
      567.0
      NaN
      NaN
    
    
      3
      2316053.0
      713.0
      166.0
      12208.0
      4912.0
      3.523074
      3.222417
      2.890955
      2.392465
      2.835826
      2.835826
      2.128167
      2.835826
      1.912965
      0.693147
      479.898621
      0.693147
      4.989207
      0.000000
      1.386294
      0.000000
      0.000000
      0.000000
      1.386294
      0.000000
      0.000000
      0.000000
      1.386294
      1.386294
      1.386294
      0.000000
      0.000000
      0.000000
      2.195412
      10.535754
      0.000000
      0.000000
      0.000000
      0.000000
      1.386294
      5954.0
      435.0
      54.375
      8.0
    
    
      4
      900676.0
      713.0
      166.0
      12208.0
      10614.0
      3.523074
      3.222417
      2.890955
      2.745177
      2.835826
      2.835826
      2.862706
      2.835826
      2.438047
      1.978298
      479.898621
      1.978298
      5.724748
      1.098612
      3.044523
      1.791759
      0.000000
      0.000000
      1.945910
      0.693147
      -1.098612
      -1.098612
      1.252763
      3.044523
      3.044523
      1.791759
      1.791759
      0.000000
      2.509016
      12.040730
      1.098612
      1.098612
      1.098612
      2.890372
      5.934894
      5954.0
      248.0
      62.000
      4.0



In [37]:

    
train_pivot_56789_to_11['id'] = train_pivot_56789_to_11['id'].astype(int)



In [38]:

    
train_pivot_56789_to_11.head()









    Out[38]:






  
    
      Semana
      id
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_6
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t2_min_t6
      t3_min_t6
      t4_min_t6
      t5_min_t6
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      t_m_6_cum
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      1547831
      713.0
      166.0
      12208.0
      91578.0
      3.523074
      3.222417
      2.890955
      1.724493
      2.835826
      2.835826
      4.809507
      2.835826
      4.642291
      4.353270
      479.898621
      4.353270
      3.596229
      3.761200
      4.682131
      4.499810
      4.574711
      4.248495
      0.920931
      0.738610
      0.813511
      0.487295
      0.182322
      0.107420
      0.433636
      -0.074901
      0.251314
      0.326216
      1.536319
      7.372772
      3.761200
      8.009695
      12.584406
      17.084215
      21.766348
      5954.0
      340.0
      42.500
      8.0
    
    
      1
      6825659
      713.0
      166.0
      12208.0
      10780.0
      3.523074
      3.222417
      2.890955
      3.987424
      2.835826
      2.835826
      3.836502
      2.835826
      3.786542
      2.987311
      479.898621
      2.987311
      8.315309
      0.000000
      3.218876
      2.564949
      3.332205
      2.833213
      3.218876
      2.564949
      3.332205
      2.833213
      0.653926
      -0.113329
      0.385662
      -0.767255
      -0.268264
      0.498991
      3.918295
      18.803841
      0.000000
      2.833213
      6.165418
      8.730368
      11.949243
      5954.0
      480.0
      NaN
      NaN
    
    
      2
      5853787
      713.0
      166.0
      12208.0
      11047.0
      3.523074
      3.222417
      2.890955
      2.694399
      2.835826
      2.835826
      3.035542
      2.835826
      2.942251
      2.600500
      479.898621
      2.600500
      5.618855
      1.791759
      2.995732
      3.044523
      2.397895
      2.772589
      1.203973
      1.252763
      0.606136
      0.980829
      -0.048790
      0.597837
      0.223144
      0.646627
      0.271934
      -0.374693
      2.439828
      11.708699
      1.791759
      4.564348
      6.962244
      10.006766
      13.002499
      5954.0
      567.0
      NaN
      NaN
    
    
      3
      2316053
      713.0
      166.0
      12208.0
      4912.0
      3.523074
      3.222417
      2.890955
      2.392465
      2.835826
      2.835826
      2.128167
      2.835826
      1.912965
      0.693147
      479.898621
      0.693147
      4.989207
      0.000000
      1.386294
      0.000000
      0.000000
      0.000000
      1.386294
      0.000000
      0.000000
      0.000000
      1.386294
      1.386294
      1.386294
      0.000000
      0.000000
      0.000000
      2.195412
      10.535754
      0.000000
      0.000000
      0.000000
      0.000000
      1.386294
      5954.0
      435.0
      54.375
      8.0
    
    
      4
      900676
      713.0
      166.0
      12208.0
      10614.0
      3.523074
      3.222417
      2.890955
      2.745177
      2.835826
      2.835826
      2.862706
      2.835826
      2.438047
      1.978298
      479.898621
      1.978298
      5.724748
      1.098612
      3.044523
      1.791759
      0.000000
      0.000000
      1.945910
      0.693147
      -1.098612
      -1.098612
      1.252763
      3.044523
      3.044523
      1.791759
      1.791759
      0.000000
      2.509016
      12.040730
      1.098612
      1.098612
      1.098612
      2.890372
      5.934894
      5954.0
      248.0
      62.000
      4.0



In [39]:

    
train_pivot_56789_to_11.to_csv('train_pivot_56789_to_11_private.csv',index = False)

for two week ahead 45678 to 10



In [8]:

    
pivot_test_week10 = pivot_test.loc[pivot_test['sem10_sem11'] == 10]
pivot_test_week10.reset_index(drop=True,inplace = True)
pivot_test_week10 = pivot_test_week10.fillna(0)
pivot_test_week10.head()









    Out[8]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      id
      sem10_sem11
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26
      31518
      0.000000
      2.397895
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1569352
      10
      2655
      2
      4189
    
    
      1
      26
      34210
      2.890372
      3.761200
      3.784190
      3.496508
      3.433987
      3.688879
      3.688879
      6667200
      10
      2061
      2
      7212
    
    
      2
      26
      34785
      3.091042
      0.000000
      0.000000
      2.833213
      3.332205
      2.564949
      3.218876
      1592616
      10
      2061
      2
      7212
    
    
      3
      26
      34786
      4.356709
      4.158883
      4.430817
      3.713572
      4.317488
      4.094345
      4.330733
      3909690
      10
      2061
      2
      7212
    
    
      4
      26
      35142
      3.258097
      1.945910
      3.583519
      3.555348
      3.688879
      3.828641
      3.663562
      3659672
      10
      2061
      2
      7212



In [23]:

    
train_45678 = train_dataset.loc[train_dataset['Semana'].isin([4,5,6,7,8]), :].copy()
train_pivot_45678_to_10 = pivot_test_week10.copy()

train_pivot_45678_to_10 = categorical_useful(train_45678,train_pivot_45678_to_10)

del train_45678
gc.collect()

train_pivot_45678_to_10 = define_time_features(train_pivot_45678_to_10, to_predict = 't_plus_2' , t_0 = 10)

train_pivot_45678_to_10 = lin_regr_features(train_pivot_45678_to_10,to_predict ='t_plus_2' , 
                                            semanas_numbers = [4,5,6,7,8],t_0 = 8)

train_pivot_45678_to_10.drop(['Sem3','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_45678_to_10[['Sem4','Sem5','Sem6','Sem7','Sem8']].cumsum(axis = 1)

train_pivot_45678_to_10.drop(['Sem4','Sem5','Sem6','Sem7','Sem8'],axis =1,inplace = True)

train_pivot_45678_to_10 = pd.concat([train_pivot_45678_to_10,train_pivot_cum_sum],axis =1)


train_pivot_45678_to_10 = train_pivot_45678_to_10.rename(columns={'Sem4': 't_m_6_cum',
                                          'Sem5': 't_m_5_cum','Sem6': 't_m_4_cum',
                                                                  'Sem7': 't_m_3_cum','Sem8': 't_m_2_cum'})
# add product_info
train_pivot_45678_to_10 = add_pro_info(train_pivot_45678_to_10)

#
train_pivot_45678_to_10 = add_product(train_pivot_45678_to_10)

train_pivot_45678_to_10.drop(['ID'],axis =1,inplace = True)

for col in train_pivot_45678_to_10.columns.values:
    train_pivot_45678_to_10[col] = train_pivot_45678_to_10[col].astype(np.float32)

train_pivot_45678_to_10.head()









    Out[23]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      id
      sem10_sem11
      Agencia_ID
      Canal_ID
      Ruta_SAK
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_6
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t2_min_t6
      t3_min_t6
      t4_min_t6
      t5_min_t6
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      t_m_6_cum
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      26.0
      31518.0
      1569352.0
      10.0
      2655.0
      2.0
      4189.0
      10.0
      158.0
      3861.0
      169.0
      3.951774
      3.165635
      2.83547
      2.493198
      3.165635
      3.033128
      2.475491
      3.165635
      2.397895
      2.397895
      448.004242
      2.397895
      5.099486
      2.397895
      0.000000
      0.000000
      0.000000
      0.000000
      -2.397895
      -2.397895
      -2.397895
      -2.397895
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      -0.301235
      -1.349546
      2.397895
      2.397895
      2.397895
      2.397895
      2.397895
      131.0
      691.0
      NaN
      NaN
    
    
      1
      26.0
      34210.0
      6667200.0
      10.0
      2061.0
      2.0
      7212.0
      658.0
      158.0
      12429.0
      10746.0
      3.509364
      3.211487
      2.83547
      4.374220
      2.805172
      2.805172
      4.490424
      2.805172
      3.720000
      3.632953
      448.004242
      3.632953
      8.946852
      3.761200
      3.688879
      3.433987
      3.496508
      3.784190
      -0.072321
      -0.327213
      -0.264693
      0.022990
      0.254892
      0.192372
      -0.095310
      -0.062520
      -0.350202
      -0.287682
      4.428435
      19.839575
      3.761200
      7.545390
      11.041898
      14.475884
      18.164764
      6027.0
      740.0
      NaN
      NaN
    
    
      2
      26.0
      34785.0
      1592616.0
      10.0
      2061.0
      2.0
      7212.0
      658.0
      158.0
      12429.0
      10736.0
      3.509364
      3.211487
      2.83547
      3.987994
      2.805172
      2.805172
      3.839655
      2.805172
      3.789903
      2.910122
      448.004242
      2.910122
      8.156881
      0.000000
      2.564949
      3.332205
      2.833213
      0.000000
      2.564949
      3.332205
      2.833213
      0.000000
      -0.767255
      -0.268264
      2.564949
      0.498991
      3.332205
      2.833213
      3.992911
      17.888412
      0.000000
      0.000000
      2.833213
      6.165418
      8.730368
      6027.0
      480.0
      NaN
      NaN
    
    
      3
      26.0
      34786.0
      3909690.0
      10.0
      2061.0
      2.0
      7212.0
      658.0
      158.0
      12429.0
      13169.0
      3.509364
      3.211487
      2.83547
      4.535940
      2.805172
      2.805172
      4.738372
      2.805172
      4.336529
      4.143021
      448.004242
      4.143021
      9.277627
      4.158883
      4.094345
      4.317488
      3.713572
      4.430817
      -0.064539
      0.158605
      -0.445311
      0.271934
      -0.223144
      0.380773
      -0.336472
      0.603916
      -0.113329
      -0.717245
      4.513783
      20.221937
      4.158883
      8.589700
      12.303272
      16.620760
      20.715105
      6027.0
      680.0
      NaN
      NaN
    
    
      4
      26.0
      35142.0
      3659672.0
      10.0
      2061.0
      2.0
      7212.0
      658.0
      158.0
      12429.0
      10624.0
      3.509364
      3.211487
      2.83547
      3.268587
      2.805172
      2.805172
      4.077493
      2.805172
      3.415331
      3.320460
      448.004242
      3.320460
      6.685434
      1.945910
      3.828641
      3.688879
      3.555348
      3.583519
      1.882731
      1.742969
      1.609438
      1.637609
      0.139762
      0.273293
      0.245122
      0.133531
      0.105361
      -0.028171
      2.966826
      13.291507
      1.945910
      5.529429
      9.084777
      12.773657
      16.602299
      6027.0
      567.0
      NaN
      NaN



In [24]:

    
train_pivot_45678_to_10.columns.values









    Out[24]:





array(['Cliente_ID', 'Producto_ID', 'id', 'sem10_sem11', 'Agencia_ID',
       'Canal_ID', 'Ruta_SAK', 'ruta_freq', 'clien_freq', 'agen_freq',
       'prod_freq', 'agen_for_log_de', 'ruta_for_log_de',
       'cliente_for_log_de', 'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_6', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't2_min_t6', 't3_min_t6',
       't4_min_t6', 't5_min_t6', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       't_m_6_cum', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)



In [26]:

    
train_pivot_45678_to_10 = train_pivot_45678_to_10[new_feature]
train_pivot_45678_to_10['id'] = train_pivot_45678_to_10['id'].astype(int)
train_pivot_45678_to_10.head()









    Out[26]:






  
    
      Semana
      id
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_6
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t2_min_t6
      t3_min_t6
      t4_min_t6
      t5_min_t6
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      t_m_6_cum
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      1569352
      10.0
      158.0
      3861.0
      169.0
      3.951774
      3.165635
      2.83547
      2.493198
      3.165635
      3.033128
      2.475491
      3.165635
      2.397895
      2.397895
      448.004242
      2.397895
      5.099486
      2.397895
      0.000000
      0.000000
      0.000000
      0.000000
      -2.397895
      -2.397895
      -2.397895
      -2.397895
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      -0.301235
      -1.349546
      2.397895
      2.397895
      2.397895
      2.397895
      2.397895
      131.0
      691.0
      NaN
      NaN
    
    
      1
      6667200
      658.0
      158.0
      12429.0
      10746.0
      3.509364
      3.211487
      2.83547
      4.374220
      2.805172
      2.805172
      4.490424
      2.805172
      3.720000
      3.632953
      448.004242
      3.632953
      8.946852
      3.761200
      3.688879
      3.433987
      3.496508
      3.784190
      -0.072321
      -0.327213
      -0.264693
      0.022990
      0.254892
      0.192372
      -0.095310
      -0.062520
      -0.350202
      -0.287682
      4.428435
      19.839575
      3.761200
      7.545390
      11.041898
      14.475884
      18.164764
      6027.0
      740.0
      NaN
      NaN
    
    
      2
      1592616
      658.0
      158.0
      12429.0
      10736.0
      3.509364
      3.211487
      2.83547
      3.987994
      2.805172
      2.805172
      3.839655
      2.805172
      3.789903
      2.910122
      448.004242
      2.910122
      8.156881
      0.000000
      2.564949
      3.332205
      2.833213
      0.000000
      2.564949
      3.332205
      2.833213
      0.000000
      -0.767255
      -0.268264
      2.564949
      0.498991
      3.332205
      2.833213
      3.992911
      17.888412
      0.000000
      0.000000
      2.833213
      6.165418
      8.730368
      6027.0
      480.0
      NaN
      NaN
    
    
      3
      3909690
      658.0
      158.0
      12429.0
      13169.0
      3.509364
      3.211487
      2.83547
      4.535940
      2.805172
      2.805172
      4.738372
      2.805172
      4.336529
      4.143021
      448.004242
      4.143021
      9.277627
      4.158883
      4.094345
      4.317488
      3.713572
      4.430817
      -0.064539
      0.158605
      -0.445311
      0.271934
      -0.223144
      0.380773
      -0.336472
      0.603916
      -0.113329
      -0.717245
      4.513783
      20.221937
      4.158883
      8.589700
      12.303272
      16.620760
      20.715105
      6027.0
      680.0
      NaN
      NaN
    
    
      4
      3659672
      658.0
      158.0
      12429.0
      10624.0
      3.509364
      3.211487
      2.83547
      3.268587
      2.805172
      2.805172
      4.077493
      2.805172
      3.415331
      3.320460
      448.004242
      3.320460
      6.685434
      1.945910
      3.828641
      3.688879
      3.555348
      3.583519
      1.882731
      1.742969
      1.609438
      1.637609
      0.139762
      0.273293
      0.245122
      0.133531
      0.105361
      -0.028171
      2.966826
      13.291507
      1.945910
      5.529429
      9.084777
      12.773657
      16.602299
      6027.0
      567.0
      NaN
      NaN



In [27]:

    
train_pivot_45678_to_10.to_pickle('validation_45678_10.pickle')



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

data for predict week 8&9, time plus 1 week

train_45678 for 8+1 =9



In [17]:

    
train_45678 = train_dataset.loc[train_dataset['Semana'].isin([4,5,6,7,8]), :].copy()
train_pivot_45678_to_9 = pivot_train_zero.loc[(pivot_train['Sem9'].notnull()),:].copy()


train_pivot_45678_to_9 = categorical_useful(train_45678,train_pivot_45678_to_9)
train_pivot_45678_to_9 = define_time_features(train_pivot_45678_to_9, to_predict = 't_plus_1' , t_0 = 9)


del train_45678
gc.collect()

train_pivot_45678_to_9 = lin_regr_features(train_pivot_45678_to_9,to_predict ='t_plus_1',
                                           semanas_numbers = [4,5,6,7,8],t_0 = 8)

train_pivot_45678_to_9['target'] = train_pivot_45678_to_9['Sem9']
train_pivot_45678_to_9.drop(['Sem3','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_45678_to_9[['Sem4','Sem5','Sem6','Sem7','Sem8']].cumsum(axis = 1)

train_pivot_45678_to_9.drop(['Sem4','Sem5','Sem6','Sem7','Sem8'],axis =1,inplace = True)

train_pivot_45678_to_9 = pd.concat([train_pivot_45678_to_9,train_pivot_cum_sum],axis =1,copy = False)

train_pivot_45678_to_9 = train_pivot_45678_to_9.rename(columns={'Sem4': 't_m_5_cum',
                                          'Sem5': 't_m_4_cum','Sem6': 't_m_3_cum', 'Sem7': 't_m_2_cum','Sem8': 't_m_1_cum'})
# add geo_info
train_pivot_45678_to_9 = add_pro_info(train_pivot_45678_to_9)

#add product info
train_pivot_45678_to_9 = add_product(train_pivot_45678_to_9)
train_pivot_45678_to_9.drop(['ID'],axis = 1,inplace = True)

for col in train_pivot_45678_to_9.columns.values:
    train_pivot_45678_to_9[col] = train_pivot_45678_to_9[col].astype(np.float32)

gc.collect()

train_pivot_45678_to_9.head()









    Out[17]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Agencia_ID
      Canal_ID
      Ruta_SAK
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_1
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t1_min_t2
      t1_min_t3
      t1_min_t4
      t1_min_t5
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      target
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      t_m_1_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      26.0
      30235.0
      2655.0
      2.0
      4189.0
      10.0
      158.0
      3861.0
      2062.0
      3.951774
      3.165635
      2.83547
      3.122565
      3.165635
      3.033128
      3.516439
      3.165635
      NaN
      NaN
      448.004242
      NaN
      6.386767
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.314400
      5.888566
      4.574711
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      233.0
      600.0
      75.00
      8.0
    
    
      1
      26.0
      31393.0
      2061.0
      2.0
      7212.0
      658.0
      158.0
      12429.0
      16321.0
      3.509364
      3.211487
      2.83547
      1.557049
      2.805172
      2.805172
      3.350600
      2.805172
      3.125611
      2.891665
      448.004242
      2.891665
      3.184724
      3.135494
      2.944439
      2.772589
      2.772589
      2.833213
      0.191055
      0.362906
      0.362906
      0.302281
      0.171850
      0.171850
      0.111226
      0.000000
      -0.060625
      -0.060625
      1.269118
      5.685703
      2.639057
      2.833213
      5.605802
      8.378390
      11.322830
      14.458324
      18989.0
      640.0
      NaN
      NaN
    
    
      2
      26.0
      32962.0
      2061.0
      2.0
      7212.0
      658.0
      158.0
      12429.0
      7375.0
      3.509364
      3.211487
      2.83547
      2.643195
      2.805172
      2.805172
      2.410218
      2.805172
      2.225570
      NaN
      448.004242
      NaN
      5.406284
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      2.042073
      9.148576
      2.397895
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      18989.0
      180.0
      30.00
      6.0
    
    
      3
      26.0
      34204.0
      2061.0
      2.0
      7212.0
      658.0
      158.0
      12429.0
      17155.0
      3.509364
      3.211487
      2.83547
      2.691697
      2.805172
      2.805172
      3.825037
      2.805172
      3.796168
      3.647111
      448.004242
      3.647111
      5.505487
      3.828641
      3.610918
      3.555348
      3.806663
      3.433987
      0.217723
      0.273293
      0.021979
      0.394654
      0.055570
      -0.195745
      0.176931
      -0.251314
      0.121361
      0.372675
      2.526538
      11.318999
      3.784190
      3.433987
      7.240650
      10.795998
      14.406916
      18.235558
      18989.0
      450.0
      56.25
      8.0
    
    
      4
      26.0
      34206.0
      2061.0
      2.0
      7212.0
      658.0
      158.0
      12429.0
      80184.0
      3.509364
      3.211487
      2.83547
      1.734170
      2.805172
      2.805172
      4.830257
      2.805172
      4.632751
      4.298187
      448.004242
      4.298187
      3.547001
      4.499810
      4.574711
      4.248495
      3.761200
      4.406719
      -0.074901
      0.251314
      0.738610
      0.093090
      0.326216
      0.813511
      0.167992
      0.487295
      -0.158224
      -0.645519
      0.990179
      4.436046
      4.682131
      4.406719
      8.167919
      12.416414
      16.991125
      21.490934
      18989.0
      340.0
      42.50
      8.0



In [19]:

    
train_pivot_45678_to_9.columns.values









    Out[19]:





array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_1', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't1_min_t2', 't1_min_t3',
       't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       't_m_1_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)



In [20]:

    
train_pivot_45678_to_9 = train_pivot_45678_to_9[['ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_1', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't1_min_t2', 't1_min_t3',
       't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       't_m_1_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces']]



In [21]:

    
train_pivot_45678_to_9.shape









    Out[21]:





(10385350, 44)



In [22]:

    
train_pivot_45678_to_9.to_csv('train_pivot_45678_to_9_whole_zero.csv')



In [23]:

    
# train_pivot_45678_to_9_old = pd.read_csv('train_pivot_45678_to_9.csv',index_col = 0)



In [30]:

    
sum(train_pivot_45678_to_9['target'].isnull())









    Out[30]:





15446188

train_34567 7+1 = 8



In [28]:

    
train_34567 = train_dataset.loc[train_dataset['Semana'].isin([3,4,5,6,7]), :].copy()
train_pivot_34567_to_8 = pivot_train_zero.loc[(pivot_train['Sem8'].notnull()),:].copy()

train_pivot_34567_to_8 = categorical_useful(train_34567,train_pivot_34567_to_8)
train_pivot_34567_to_8 = define_time_features(train_pivot_34567_to_8, to_predict = 't_plus_1' , t_0 = 8)

del train_34567
gc.collect()

train_pivot_34567_to_8 = lin_regr_features(train_pivot_34567_to_8,to_predict = 't_plus_1',
                                           semanas_numbers = [3,4,5,6,7],t_0 = 7)

train_pivot_34567_to_8['target'] = train_pivot_34567_to_8['Sem8']
train_pivot_34567_to_8.drop(['Sem8','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_34567_to_8[['Sem3','Sem4','Sem5','Sem6','Sem7']].cumsum(axis = 1)

train_pivot_34567_to_8.drop(['Sem3','Sem4','Sem5','Sem6','Sem7'],axis =1,inplace = True)

train_pivot_34567_to_8 = pd.concat([train_pivot_34567_to_8,train_pivot_cum_sum],axis =1)

train_pivot_34567_to_8 = train_pivot_34567_to_8.rename(columns={'Sem3': 't_m_5_cum','Sem4': 't_m_4_cum',
                                          'Sem5': 't_m_3_cum','Sem6': 't_m_2_cum',
                                                                'Sem7': 't_m_1_cum'})
# add product_info
train_pivot_34567_to_8 = add_pro_info(train_pivot_34567_to_8)

#add product

train_pivot_34567_to_8 = add_product(train_pivot_34567_to_8)

train_pivot_34567_to_8.drop(['ID'],axis = 1,inplace = True)

for col in train_pivot_34567_to_8.columns.values:
    train_pivot_34567_to_8[col] = train_pivot_34567_to_8[col].astype(np.float32)

gc.collect()

train_pivot_34567_to_8.head()









    Out[28]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Agencia_ID
      Canal_ID
      Ruta_SAK
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_1
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t1_min_t2
      t1_min_t3
      t1_min_t4
      t1_min_t5
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      target
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      t_m_1_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      26.0
      1182.0
      2061.0
      2.0
      7212.0
      548.0
      160.0
      12574.0
      411493.0
      3.489749
      3.208231
      2.897803
      1.766994
      2.877003
      2.877003
      4.370318
      2.877003
      3.694196
      1.844440
      463.648468
      1.844440
      3.693589
      0.000000
      0.000000
      0.000000
      0.000000
      3.688879
      0.000000
      0.000000
      0.000000
      -3.688879
      0.000000
      0.000000
      -3.688879
      0.000000
      -3.688879
      -3.688879
      1.127138
      5.225957
      0.000000
      3.688879
      3.688879
      3.688879
      3.688879
      3.688879
      18434.0
      210.0
      210.00
      1.0
    
    
      1
      26.0
      4767.0
      2061.0
      2.0
      7212.0
      548.0
      160.0
      12574.0
      112080.0
      3.489749
      3.208231
      2.897803
      1.482496
      2.877003
      2.877003
      4.253512
      2.877003
      4.164880
      3.761200
      463.648468
      3.761200
      3.098896
      3.761200
      0.000000
      0.000000
      0.000000
      3.761200
      3.761200
      3.761200
      3.761200
      0.000000
      0.000000
      0.000000
      -3.761200
      0.000000
      -3.761200
      -3.761200
      0.768180
      3.561656
      3.761200
      3.761200
      3.761200
      3.761200
      3.761200
      7.522400
      18434.0
      250.0
      NaN
      NaN
    
    
      2
      26.0
      31393.0
      2061.0
      2.0
      7212.0
      548.0
      160.0
      12574.0
      15736.0
      3.489749
      3.208231
      2.897803
      1.563950
      2.877003
      2.877003
      3.333444
      2.877003
      3.013416
      2.873471
      463.648468
      2.873471
      3.269161
      2.944439
      2.772589
      2.772589
      2.833213
      3.044523
      0.171850
      0.171850
      0.111226
      -0.100083
      0.000000
      -0.060625
      -0.271934
      -0.060625
      -0.271934
      -0.211309
      1.298646
      6.021154
      3.135494
      3.044523
      5.877736
      8.650325
      11.422914
      14.367352
      18434.0
      640.0
      NaN
      NaN
    
    
      3
      26.0
      34204.0
      2061.0
      2.0
      7212.0
      548.0
      160.0
      12574.0
      17261.0
      3.489749
      3.208231
      2.897803
      2.674463
      2.877003
      2.877003
      3.848593
      2.877003
      3.819687
      3.638221
      463.648468
      3.638221
      5.590491
      3.610918
      3.555348
      3.806663
      3.433987
      3.784190
      0.055570
      -0.195745
      0.176931
      -0.173272
      -0.251314
      0.121361
      -0.228842
      0.372675
      0.022473
      -0.350202
      2.560121
      11.869964
      3.828641
      3.784190
      7.218177
      11.024839
      14.580188
      18.191105
      18434.0
      450.0
      56.25
      8.0
    
    
      4
      26.0
      34206.0
      2061.0
      2.0
      7212.0
      548.0
      160.0
      12574.0
      87803.0
      3.489749
      3.208231
      2.897803
      1.730373
      2.877003
      2.877003
      4.836590
      2.877003
      4.628871
      4.357383
      463.648468
      4.357383
      3.617039
      4.574711
      4.248495
      3.761200
      4.406719
      4.795791
      0.326216
      0.813511
      0.167992
      -0.221080
      0.487295
      -0.158224
      -0.547295
      -0.645519
      -1.034590
      -0.389071
      1.379329
      6.395236
      4.499810
      4.795791
      9.202510
      12.963710
      17.212206
      21.786917
      18434.0
      340.0
      42.50
      8.0



In [29]:

    
train_pivot_34567_to_8.shape









    Out[29]:





(10383302, 49)



In [30]:

    
train_pivot_34567_to_8.columns.values









    Out[30]:





array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_1', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't1_min_t2', 't1_min_t3',
       't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       't_m_1_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)



In [50]:

    
train_pivot_34567_to_8.to_csv('train_pivot_34567_to_8.csv')



In [5]:

    
train_pivot_34567_to_8 = pd.read_csv('train_pivot_34567_to_8.csv',index_col = 0)



In [20]:

    
gc.collect()









    Out[20]:





42

concat train_pivot_45678_to_9 & train_pivot_34567_to_8 to perform t_plus_1, train_data is over



In [31]:

    
train_pivot_xgb_time1 = pd.concat([train_pivot_45678_to_9, train_pivot_34567_to_8],axis = 0,copy = False)



In [32]:

    
train_pivot_xgb_time1 = train_pivot_xgb_time1[['ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_1', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't1_min_t2', 't1_min_t3',
       't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       't_m_1_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces']]



In [33]:

    
train_pivot_xgb_time1.columns.values









    Out[33]:





array(['ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_1', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't1_min_t2', 't1_min_t3',
       't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       't_m_1_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)



In [34]:

    
train_pivot_xgb_time1.shape









    Out[34]:





(20768652, 44)



In [54]:

    
np.sum(train_pivot_xgb_time1.memory_usage())/(1024**3)









    Out[54]:





8



In [35]:

    
train_pivot_xgb_time1.to_csv('train_pivot_xgb_time1_44fea_zero.csv',index = False)



In [54]:

    
train_pivot_xgb_time1.to_csv('train_pivot_xgb_time1.csv')



In [26]:

    
del train_pivot_xgb_time1
del train_pivot_45678_to_9
del train_pivot_34567_to_8
gc.collect()









    Out[26]:





16

prepare for test data, for week 10, we use 5,6,7,8,9



In [7]:

    
pivot_test.head()









    Out[7]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      Canal_ID
      Ruta_SAK
      id
      Semana
    
  
  
    
      0
      26
      31518
      NaN
      2.397895
      NaN
      NaN
      NaN
      NaN
      NaN
      2655
      2
      4189
      1569352
      10
    
    
      1
      26
      34206
      4.795791
      4.406719
      3.76120
      4.248495
      4.574711
      4.499810
      4.682131
      2061
      2
      7212
      1547831
      11
    
    
      2
      26
      34210
      2.890372
      3.761200
      3.78419
      3.496508
      3.433987
      3.688879
      3.688879
      2061
      2
      7212
      6667200
      10
    
    
      3
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      2061
      2
      7212
      1592616
      10
    
    
      4
      26
      34785
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      2061
      2
      7212
      6825659
      11



In [40]:

    
pivot_test_week10 = pivot_test.loc[pivot_test['sem10_sem11'] == 10]
pivot_test_week10.reset_index(drop=True,inplace = True)
pivot_test_week10 = pivot_test_week10.fillna(0)
pivot_test_week10.head()









    Out[40]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      id
      sem10_sem11
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26
      31518
      0.000000
      2.397895
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1569352
      10
      2655
      2
      4189
    
    
      1
      26
      34210
      2.890372
      3.761200
      3.784190
      3.496508
      3.433987
      3.688879
      3.688879
      6667200
      10
      2061
      2
      7212
    
    
      2
      26
      34785
      3.091042
      0.000000
      0.000000
      2.833213
      3.332205
      2.564949
      3.218876
      1592616
      10
      2061
      2
      7212
    
    
      3
      26
      34786
      4.356709
      4.158883
      4.430817
      3.713572
      4.317488
      4.094345
      4.330733
      3909690
      10
      2061
      2
      7212
    
    
      4
      26
      35142
      3.258097
      1.945910
      3.583519
      3.555348
      3.688879
      3.828641
      3.663562
      3659672
      10
      2061
      2
      7212



In [41]:

    
pivot_test_week10.shape









    Out[41]:





(3538385, 14)



In [42]:

    
train_56789 = train_dataset.loc[train_dataset['Semana'].isin([5,6,7,8,9]), :].copy()
train_pivot_56789_to_10 = pivot_test_week10.copy()

train_pivot_56789_to_10 = categorical_useful(train_56789,train_pivot_56789_to_10)

del train_56789
gc.collect()

train_pivot_56789_to_10 = define_time_features(train_pivot_56789_to_10, to_predict = 't_plus_1' , t_0 = 10)

train_pivot_56789_to_10 = lin_regr_features(train_pivot_56789_to_10,to_predict ='t_plus_1' , 
                                            semanas_numbers = [5,6,7,8,9],t_0 = 9)

train_pivot_56789_to_10.drop(['Sem3','Sem4'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_56789_to_10[['Sem5','Sem6','Sem7','Sem8','Sem9']].cumsum(axis = 1)

train_pivot_56789_to_10.drop(['Sem5','Sem6','Sem7','Sem8','Sem9'],axis =1,inplace = True)

train_pivot_56789_to_10 = pd.concat([train_pivot_56789_to_10,train_pivot_cum_sum],axis =1)


train_pivot_56789_to_10 = train_pivot_56789_to_10.rename(columns={'Sem5': 't_m_5_cum',
                                          'Sem6': 't_m_4_cum','Sem7': 't_m_3_cum',
                                                                  'Sem8': 't_m_2_cum','Sem9': 't_m_1_cum'})
# add product_info
train_pivot_56789_to_10 = add_pro_info(train_pivot_56789_to_10)

#
train_pivot_56789_to_10 = add_product(train_pivot_56789_to_10)

train_pivot_56789_to_10.drop(['ID'],axis =1,inplace = True)

for col in train_pivot_56789_to_10.columns.values:
    train_pivot_56789_to_10[col] = train_pivot_56789_to_10[col].astype(np.float32)

train_pivot_56789_to_10.head()









    Out[42]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      id
      sem10_sem11
      Agencia_ID
      Canal_ID
      Ruta_SAK
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_1
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t1_min_t2
      t1_min_t3
      t1_min_t4
      t1_min_t5
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      t_m_1_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      26.0
      31518.0
      1569352.0
      10.0
      2655.0
      2.0
      4189.0
      10.0
      166.0
      3604.0
      143.0
      4.001407
      3.411275
      2.890955
      2.498162
      3.411275
      3.217137
      2.499994
      3.411275
      NaN
      NaN
      479.898621
      NaN
      5.209625
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.022223
      0.106650
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      131.0
      691.0
      NaN
      NaN
    
    
      1
      26.0
      34210.0
      6667200.0
      10.0
      2061.0
      2.0
      7212.0
      713.0
      166.0
      12208.0
      10842.0
      3.523074
      3.222417
      2.890955
      4.361940
      2.835826
      2.835826
      4.469590
      2.835826
      3.826318
      3.618489
      479.898621
      3.618489
      9.096318
      3.688879
      3.688879
      3.433987
      3.496508
      3.784190
      0.000000
      0.254892
      0.192372
      -0.095310
      0.254892
      0.192372
      -0.095310
      -0.062520
      -0.350202
      -0.287682
      4.399127
      21.111349
      3.784190
      7.280697
      10.714684
      14.403563
      18.092443
      6027.0
      740.0
      NaN
      NaN
    
    
      2
      26.0
      34785.0
      1592616.0
      10.0
      2061.0
      2.0
      7212.0
      713.0
      166.0
      12208.0
      10780.0
      3.523074
      3.222417
      2.890955
      3.987424
      2.835826
      2.835826
      3.836502
      2.835826
      3.786542
      2.987311
      479.898621
      2.987311
      8.315309
      3.218876
      2.564949
      3.332205
      2.833213
      0.000000
      0.653926
      -0.113329
      0.385662
      3.218876
      -0.767255
      -0.268264
      2.564949
      0.498991
      3.332205
      2.833213
      3.982773
      19.113272
      0.000000
      2.833213
      6.165418
      8.730368
      11.949243
      6027.0
      480.0
      NaN
      NaN
    
    
      3
      26.0
      34786.0
      3909690.0
      10.0
      2061.0
      2.0
      7212.0
      713.0
      166.0
      12208.0
      13222.0
      3.523074
      3.222417
      2.890955
      4.529289
      2.835826
      2.835826
      4.721229
      2.835826
      4.455762
      4.177391
      479.898621
      4.177391
      9.445304
      4.330733
      4.094345
      4.317488
      3.713572
      4.430817
      0.236389
      0.013245
      0.617161
      -0.100083
      -0.223144
      0.380773
      -0.336472
      0.603916
      -0.113329
      -0.717245
      4.488874
      21.542048
      4.430817
      8.144389
      12.461877
      16.556221
      20.886955
      6027.0
      680.0
      NaN
      NaN
    
    
      4
      26.0
      35142.0
      3659672.0
      10.0
      2061.0
      2.0
      7212.0
      713.0
      166.0
      12208.0
      10881.0
      3.523074
      3.222417
      2.890955
      3.238592
      2.835826
      2.835826
      4.112633
      2.835826
      3.512973
      3.663990
      479.898621
      3.663990
      6.753706
      3.663562
      3.828641
      3.688879
      3.555348
      3.583519
      -0.165080
      -0.025318
      0.108214
      0.080043
      0.139762
      0.273293
      0.245122
      0.133531
      0.105361
      -0.028171
      3.015531
      14.471490
      3.583519
      7.138867
      10.827746
      14.656388
      18.319950
      6027.0
      567.0
      NaN
      NaN



In [44]:

    
train_pivot_56789_to_10  = train_pivot_56789_to_10[['id','ruta_freq', 'clien_freq', 'agen_freq',
       'prod_freq', 'agen_for_log_de', 'ruta_for_log_de',
       'cliente_for_log_de', 'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_1', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't1_min_t2', 't1_min_t3',
       't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum', 't_m_1_cum',
       'NombreCliente', 'weight', 'weight_per_piece', 'pieces']]



In [48]:

    
train_pivot_56789_to_10.head()









    Out[48]:






  
    
      Semana
      id
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      cliente_producto_agen_for_log_sum
      corr
      t_min_1
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      t1_min_t2
      t1_min_t3
      t1_min_t4
      t1_min_t5
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      t_m_1_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      1569352.0
      10.0
      166.0
      3604.0
      143.0
      4.001407
      3.411275
      2.890955
      2.498162
      3.411275
      3.217137
      2.499994
      3.411275
      NaN
      NaN
      479.898621
      NaN
      5.209625
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.022223
      0.106650
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      131.0
      691.0
      NaN
      NaN
    
    
      1
      6667200.0
      713.0
      166.0
      12208.0
      10842.0
      3.523074
      3.222417
      2.890955
      4.361940
      2.835826
      2.835826
      4.469590
      2.835826
      3.826318
      3.618489
      479.898621
      3.618489
      9.096318
      3.688879
      3.688879
      3.433987
      3.496508
      3.784190
      0.000000
      0.254892
      0.192372
      -0.095310
      0.254892
      0.192372
      -0.095310
      -0.062520
      -0.350202
      -0.287682
      4.399127
      21.111349
      3.784190
      7.280697
      10.714684
      14.403563
      18.092443
      6027.0
      740.0
      NaN
      NaN
    
    
      2
      1592616.0
      713.0
      166.0
      12208.0
      10780.0
      3.523074
      3.222417
      2.890955
      3.987424
      2.835826
      2.835826
      3.836502
      2.835826
      3.786542
      2.987311
      479.898621
      2.987311
      8.315309
      3.218876
      2.564949
      3.332205
      2.833213
      0.000000
      0.653926
      -0.113329
      0.385662
      3.218876
      -0.767255
      -0.268264
      2.564949
      0.498991
      3.332205
      2.833213
      3.982773
      19.113272
      0.000000
      2.833213
      6.165418
      8.730368
      11.949243
      6027.0
      480.0
      NaN
      NaN
    
    
      3
      3909690.0
      713.0
      166.0
      12208.0
      13222.0
      3.523074
      3.222417
      2.890955
      4.529289
      2.835826
      2.835826
      4.721229
      2.835826
      4.455762
      4.177391
      479.898621
      4.177391
      9.445304
      4.330733
      4.094345
      4.317488
      3.713572
      4.430817
      0.236389
      0.013245
      0.617161
      -0.100083
      -0.223144
      0.380773
      -0.336472
      0.603916
      -0.113329
      -0.717245
      4.488874
      21.542048
      4.430817
      8.144389
      12.461877
      16.556221
      20.886955
      6027.0
      680.0
      NaN
      NaN
    
    
      4
      3659672.0
      713.0
      166.0
      12208.0
      10881.0
      3.523074
      3.222417
      2.890955
      3.238592
      2.835826
      2.835826
      4.112633
      2.835826
      3.512973
      3.663990
      479.898621
      3.663990
      6.753706
      3.663562
      3.828641
      3.688879
      3.555348
      3.583519
      -0.165080
      -0.025318
      0.108214
      0.080043
      0.139762
      0.273293
      0.245122
      0.133531
      0.105361
      -0.028171
      3.015531
      14.471490
      3.583519
      7.138867
      10.827746
      14.656388
      18.319950
      6027.0
      567.0
      NaN
      NaN



In [49]:

    
train_pivot_56789_to_10.shape









    Out[49]:





(3538385, 44)



In [46]:

    
len(train_pivot_56789_to_10.columns.values)









    Out[46]:





44



In [50]:

    
train_pivot_56789_to_10.to_pickle('train_pivot_56789_to_10_44fea_zero.pickle')

begin predict for week 11

train_3456 for 6+2 = 8



In [27]:

    
train_3456 = train_dataset.loc[train_dataset['Semana'].isin([3,4,5,6]), :].copy()
train_pivot_3456_to_8 = pivot_train.loc[(pivot_train['Sem8'].notnull()),:].copy()

train_pivot_3456_to_8 = categorical_useful(train_3456,train_pivot_3456_to_8)

del train_3456
gc.collect()

train_pivot_3456_to_8 = define_time_features(train_pivot_3456_to_8, to_predict = 't_plus_2' , t_0 = 8)

#notice that the t_0 means different
train_pivot_3456_to_8 = lin_regr_features(train_pivot_3456_to_8,to_predict = 't_plus_2', semanas_numbers = [3,4,5,6],t_0 = 6)

train_pivot_3456_to_8['target'] = train_pivot_3456_to_8['Sem8']
train_pivot_3456_to_8.drop(['Sem7','Sem8','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_3456_to_8[['Sem3','Sem4','Sem5','Sem6']].cumsum(axis = 1)

train_pivot_3456_to_8.drop(['Sem3','Sem4','Sem5','Sem6'],axis =1,inplace = True)

train_pivot_3456_to_8 = pd.concat([train_pivot_3456_to_8,train_pivot_cum_sum],axis =1)

train_pivot_3456_to_8 = train_pivot_3456_to_8.rename(columns={'Sem4': 't_m_4_cum',
                                          'Sem5': 't_m_3_cum','Sem6': 't_m_2_cum', 'Sem3': 't_m_5_cum'})
# add product_info
train_pivot_3456_to_8 = add_pro_info(train_pivot_3456_to_8)

train_pivot_3456_to_8 = add_product(train_pivot_3456_to_8)

train_pivot_3456_to_8.drop(['ID'],axis =1,inplace = True)

train_pivot_3456_to_8.head()









    Out[27]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Agencia_ID
      Canal_ID
      Ruta_SAK
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ...
      target
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      NombreCliente
      ID
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      26
      1182
      2061
      2
      7212
      410.0
      127.0
      10085
      320507.0
      3.491654
      ...
      0.000000
      3.688879
      NaN
      NaN
      NaN
      18434
      1182
      210.0
      210.00
      1.0
    
    
      1
      26
      4767
      2061
      2
      7212
      410.0
      127.0
      10085
      88159.0
      3.491654
      ...
      3.761200
      3.761200
      NaN
      NaN
      NaN
      18434
      4767
      250.0
      NaN
      NaN
    
    
      2
      26
      31393
      2061
      2
      7212
      410.0
      127.0
      10085
      12237.0
      3.491654
      ...
      3.135494
      3.044522
      5.877736
      8.650325
      11.422913
      18434
      31393
      640.0
      NaN
      NaN
    
    
      3
      26
      34204
      2061
      2
      7212
      410.0
      127.0
      10085
      13814.0
      3.491654
      ...
      3.828641
      3.784190
      7.218177
      11.024839
      14.580187
      18434
      34204
      450.0
      56.25
      8.0
    
    
      4
      26
      34206
      2061
      2
      7212
      410.0
      127.0
      10085
      71868.0
      3.491654
      ...
      4.499810
      4.795791
      9.202510
      12.963710
      17.212205
      18434
      34206
      340.0
      42.50
      8.0
    
  

5 rows × 44 columns



In [30]:

    
train_pivot_3456_to_8.columns.values









    Out[30]:





array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_2', 't_min_3',
       't_min_4', 't_min_5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)



In [58]:

    
train_pivot_3456_to_8.to_csv('train_pivot_3456_to_8.csv')

train_4567 for 7 + 2 = 9



In [28]:

    
train_4567 = train_dataset.loc[train_dataset['Semana'].isin([4,5,6,7]), :].copy()
train_pivot_4567_to_9 = pivot_train.loc[(pivot_train['Sem9'].notnull()),:].copy()

train_pivot_4567_to_9 = categorical_useful(train_4567,train_pivot_4567_to_9)

del train_4567
gc.collect()

train_pivot_4567_to_9 = define_time_features(train_pivot_4567_to_9, to_predict = 't_plus_2' , t_0 = 9)

#notice that the t_0 means different
train_pivot_4567_to_9 = lin_regr_features(train_pivot_4567_to_9,to_predict = 't_plus_2', 
                                          semanas_numbers = [4,5,6,7],t_0 = 7)

train_pivot_4567_to_9['target'] = train_pivot_4567_to_9['Sem9']
train_pivot_4567_to_9.drop(['Sem3','Sem8','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_4567_to_9[['Sem7','Sem4','Sem5','Sem6']].cumsum(axis = 1)

train_pivot_4567_to_9.drop(['Sem7','Sem4','Sem5','Sem6'],axis =1,inplace = True)

train_pivot_4567_to_9 = pd.concat([train_pivot_4567_to_9,train_pivot_cum_sum],axis =1)

train_pivot_4567_to_9 = train_pivot_4567_to_9.rename(columns={'Sem4': 't_m_5_cum',
                                          'Sem5': 't_m_4_cum','Sem6': 't_m_3_cum', 'Sem7': 't_m_2_cum'})
# add product_info
train_pivot_4567_to_9 = add_pro_info(train_pivot_4567_to_9)

train_pivot_4567_to_9 = add_product(train_pivot_4567_to_9)

train_pivot_4567_to_9.drop(['ID'],axis =1,inplace = True)

train_pivot_4567_to_9.head()









    Out[28]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Agencia_ID
      Canal_ID
      Ruta_SAK
      ruta_freq
      clien_freq
      agen_freq
      prod_freq
      agen_for_log_de
      ...
      LR_prod_corr
      target
      t_m_2_cum
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      26
      30235
      2655
      2
      4189
      10.0
      126.0
      3322
      1670.0
      3.939269
      ...
      12.376057
      4.574711
      NaN
      NaN
      NaN
      NaN
      233
      600.0
      75.00
      8.0
    
    
      1
      26
      31393
      2061
      2
      7212
      518.0
      126.0
      9992
      12729.0
      3.502839
      ...
      6.542010
      2.639057
      2.944439
      5.777652
      8.550241
      11.322830
      18989
      640.0
      NaN
      NaN
    
    
      2
      26
      32962
      2061
      2
      7212
      518.0
      126.0
      9992
      5940.0
      3.502839
      ...
      10.276987
      2.397895
      NaN
      NaN
      NaN
      NaN
      18989
      180.0
      30.00
      6.0
    
    
      3
      26
      34204
      2061
      2
      7212
      518.0
      126.0
      9992
      13625.0
      3.502839
      ...
      11.267610
      3.784190
      3.610918
      7.044905
      10.851568
      14.406916
      18989
      450.0
      56.25
      8.0
    
    
      4
      26
      34206
      2061
      2
      7212
      518.0
      126.0
      9992
      62752.0
      3.502839
      ...
      7.308716
      4.682131
      4.574711
      8.981430
      12.742630
      16.991126
      18989
      340.0
      42.50
      8.0
    
  

5 rows × 43 columns



In [31]:

    
train_pivot_4567_to_9.columns.values









    Out[31]:





array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_2', 't_min_3',
       't_min_4', 't_min_5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_2_cum', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)



In [16]:

    
train_pivot_4567_to_9.to_csv('train_pivot_4567_to_9.csv')

concat



In [32]:

    
train_pivot_xgb_time2 = pd.concat([train_pivot_3456_to_8, train_pivot_4567_to_9],axis = 0,copy = False)



In [33]:

    
train_pivot_xgb_time2.columns.values









    Out[33]:





array(['Agencia_ID', 'Canal_ID', 'Cliente_ID', 'LR_prod', 'LR_prod_corr',
       'NombreCliente', 'Producto_ID', 'Ruta_SAK',
       'agen_cliente_for_log_de', 'agen_for_log_de', 'agen_freq',
       'agen_producto_for_log_de', 'agen_ruta_for_log_de', 'clien_freq',
       'cliente_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'cliente_producto_for_log_de',
       'corr', 'pieces', 'prod_freq', 'producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_for_log_de', 'ruta_freq',
       'ruta_producto_for_log_de', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 't_m_2_cum', 't_m_3_cum',
       't_m_4_cum', 't_m_5_cum', 't_min_2', 't_min_3', 't_min_4',
       't_min_5', 'target', 'weight', 'weight_per_piece'], dtype=object)



In [34]:

    
train_pivot_xgb_time2.shape









    Out[34]:





(20768652, 43)



In [35]:

    
train_pivot_xgb_time2.to_csv('train_pivot_xgb_time2_38fea.csv')



In [66]:

    
train_pivot_xgb_time2 = pd.read_csv('train_pivot_xgb_time2.csv',index_col = 0)
train_pivot_xgb_time2.head()









    Out[66]:






  
    
      
      Agencia_ID
      Canal_ID
      Cliente_ID
      LR_prod
      LR_prod_corr
      NombreCliente
      Producto_ID
      Ruta_SAK
      agen_cliente_for_log_de
      agen_for_log_de
      ...
      t_m_3_cum
      t_m_4_cum
      t_m_5_cum
      t_min_2
      t_min_3
      t_min_4
      t_min_5
      target
      weight
      weight_per_piece
    
  
  
    
      0
      2061
      2
      26
      2.001190
      7.293554
      18434
      1182
      7212
      2.852285
      3.491654
      ...
      NaN
      NaN
      3.688879
      NaN
      NaN
      NaN
      3.688879
      0.000000
      210.0
      210.00
    
    
      1
      2061
      2
      26
      1.839411
      6.703932
      18434
      4767
      7212
      2.852285
      3.491654
      ...
      NaN
      NaN
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      250.0
      NaN
    
    
      2
      2061
      2
      26
      1.911283
      6.965878
      18434
      31393
      7212
      2.852285
      3.491654
      ...
      8.650325
      5.877736
      3.044522
      2.772589
      2.772589
      2.833213
      3.044522
      3.135494
      640.0
      NaN
    
    
      3
      2061
      2
      26
      3.113374
      11.347029
      18434
      34204
      7212
      2.852285
      3.491654
      ...
      11.024839
      7.218177
      3.784190
      3.555348
      3.806662
      3.433987
      3.784190
      3.828641
      450.0
      56.25
    
    
      4
      2061
      2
      26
      2.031231
      7.403043
      18434
      34206
      7212
      2.852285
      3.491654
      ...
      12.963710
      9.202510
      4.795791
      4.248495
      3.761200
      4.406719
      4.795791
      4.499810
      340.0
      42.50
    
  

5 rows × 38 columns



In [72]:

    
del train_pivot_3456_to_8
del train_pivot_4567_to_9
del train_pivot_xgb_time2
del train_pivot_34567_to_8
del train_pivot_45678_to_9
del train_pivot_xgb_time1
gc.collect()









    Out[72]:





119

for test data week 11, we use 6,7,8,9



In [29]:

    
pivot_test_week11 = pivot_test_new.loc[pivot_test_new['Semana'] == 11]
pivot_test_week11.reset_index(drop=True,inplace = True)
pivot_test_week11.head()









    Out[29]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      id
      Semana
      Agencia_ID
      Canal_ID
      Ruta_SAK
    
  
  
    
      0
      26.0
      34206.0
      4.795791
      4.406719
      3.761200
      4.248495
      4.574711
      4.499810
      4.682131
      1547831
      11
      2061
      2
      7212
    
    
      1
      26.0
      34785.0
      3.091042
      NaN
      NaN
      2.833213
      3.332205
      2.564949
      3.218876
      6825659
      11
      2061
      2
      7212
    
    
      2
      26.0
      35148.0
      2.772589
      2.772589
      1.791759
      2.772589
      2.397895
      3.044522
      2.995732
      5853787
      11
      2061
      2
      7212
    
    
      3
      26.0
      43335.0
      1.945910
      1.386294
      NaN
      NaN
      NaN
      0.000000
      1.386294
      2316053
      11
      2061
      2
      7212
    
    
      4
      26.0
      43342.0
      1.386294
      2.197225
      1.098612
      NaN
      NaN
      1.791759
      3.044522
      900676
      11
      2061
      2
      7212



In [83]:

    
pivot_test_week11.shape









    Out[83]:





(2684949, 14)



In [30]:

    
train_6789 = train_dataset.loc[train_dataset['Semana'].isin([6,7,8,9]), :].copy()
train_pivot_6789_to_11 = pivot_test_week11.copy()

train_pivot_6789_to_11 = categorical_useful(train_6789,train_pivot_6789_to_11)

del train_6789
gc.collect()

train_pivot_6789_to_11 = define_time_features(train_pivot_6789_to_11, to_predict = 't_plus_2' , t_0 = 11)
train_pivot_6789_to_11 = lin_regr_features(train_pivot_6789_to_11,to_predict ='t_plus_2' ,
                                           semanas_numbers = [6,7,8,9],t_0 = 9)

train_pivot_6789_to_11.drop(['Sem3','Sem4','Sem5'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_6789_to_11[['Sem6','Sem7','Sem8','Sem9']].cumsum(axis = 1)

train_pivot_6789_to_11.drop(['Sem6','Sem7','Sem8','Sem9'],axis =1,inplace = True)

train_pivot_6789_to_11 = pd.concat([train_pivot_6789_to_11,train_pivot_cum_sum],axis =1)

train_pivot_6789_to_11 = train_pivot_6789_to_11.rename(columns={'Sem6': 't_m_5_cum',
                                          'Sem7': 't_m_4_cum', 'Sem8': 't_m_3_cum','Sem9': 't_m_2_cum'})
# add product_info
train_pivot_6789_to_11 = add_pro_info(train_pivot_6789_to_11)

train_pivot_6789_to_11 = add_product(train_pivot_6789_to_11)

train_pivot_6789_to_11.drop(['ID'],axis = 1,inplace = True)

train_pivot_6789_to_11.head()









    Out[30]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      id
      Semana
      Agencia_ID
      Canal_ID
      Ruta_SAK
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      ...
      LR_prod
      LR_prod_corr
      t_m_5_cum
      t_m_4_cum
      t_m_3_cum
      t_m_2_cum
      NombreCliente
      weight
      weight_per_piece
      pieces
    
  
  
    
      0
      26.0
      34206.0
      1547831
      11
      2061
      2
      7212
      3.535522
      3.224378
      2.939548
      ...
      1.891621
      7.506688
      4.248495
      8.823206
      13.323016
      18.005147
      5954
      340.0
      42.500
      8.0
    
    
      1
      26.0
      34785.0
      6825659
      11
      2061
      2
      7212
      3.535522
      3.224378
      2.939548
      ...
      4.016822
      15.940314
      2.833213
      6.165418
      8.730367
      11.949243
      5954
      480.0
      NaN
      NaN
    
    
      2
      26.0
      35148.0
      5853787
      11
      2061
      2
      7212
      3.535522
      3.224378
      2.939548
      ...
      2.707100
      10.742829
      2.772589
      5.170484
      8.215006
      11.210739
      5954
      567.0
      NaN
      NaN
    
    
      3
      26.0
      43335.0
      2316053
      11
      2061
      2
      7212
      3.535522
      3.224378
      2.939548
      ...
      2.513660
      9.975183
      NaN
      NaN
      0.000000
      1.386294
      5954
      435.0
      54.375
      8.0
    
    
      4
      26.0
      43342.0
      900676
      11
      2061
      2
      7212
      3.535522
      3.224378
      2.939548
      ...
      2.846363
      11.295476
      NaN
      NaN
      1.791759
      4.836282
      5954
      248.0
      62.000
      4.0
    
  

5 rows × 39 columns



In [85]:

    
train_pivot_6789_to_11.shape









    Out[85]:





(2684949, 39)



In [31]:

    
train_pivot_6789_to_11.to_pickle('train_pivot_6789_to_11_new.pickle')

over



In [27]:

    
% time pivot_train_categorical_useful = categorical_useful(train_dataset,pivot_train,is_train = True)



In [ ]:

    
% time pivot_train_categorical_useful = categorical_useful(train_dataset,pivot_train,is_train = True)



In [34]:

    
pivot_train_categorical_useful_train.to_csv('pivot_train_categorical_useful_with_nan.csv')



In [2]:

    
pivot_train_categorical_useful_train = pd.read_csv('pivot_train_categorical_useful_with_nan.csv',index_col = 0)
pivot_train_categorical_useful_train.head()









    Out[2]:






  
    
      
      3
      4
      5
      6
      7
      8
      9
      Canal_ID
      agen_for_log_de
      ruta_for_log_de
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
    
  
  
    
      0
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2
      3.489749
      3.208232
      2.897803
      1.766994
      2.877003
      2.877003
      4.370318
      2.877003
      3.694196
      1.84444
      463.648478
    
    
      1
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2
      3.489749
      3.208232
      2.897803
      1.482496
      2.877003
      2.877003
      4.253513
      2.877003
      4.164880
      3.76120
      463.648478
    
    
      2
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2
      3.936934
      3.095841
      2.897803
      3.106304
      3.095841
      3.015667
      3.569138
      3.095841
      NaN
      NaN
      463.648478
    
    
      3
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2
      3.936934
      3.095841
      2.897803
      3.878160
      3.095841
      3.015667
      3.943380
      3.095841
      3.891820
      3.89182
      463.648478
    
    
      4
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2
      3.489749
      3.208232
      2.897803
      1.563950
      2.877003
      2.877003
      3.333444
      2.877003
      3.013416
      2.87347
      463.648478

create time feature



In [13]:

    
pivot_train_categorical_useful.head()









    Out[13]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      3
      4
      5
      6
      7
      8
      9
      Agencia_ID
      ...
      cliente_for_log_de
      producto_for_log_de
      agen_ruta_for_log_de
      agen_cliente_for_log_de
      agen_producto_for_log_de
      ruta_cliente_for_log_de
      ruta_producto_for_log_de
      cliente_producto_for_log_de
      cliente_for_log_sum
      corr
    
  
  
    
      0
      26
      1182
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2061
      ...
      2.897803
      1.766994
      2.877003
      2.877003
      4.370318
      2.877003
      3.694196
      1.84444
      463.648478
      3.693589
    
    
      1
      26
      4767
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2061
      ...
      2.897803
      1.482496
      2.877003
      2.877003
      4.253513
      2.877003
      4.164880
      3.76120
      463.648478
      3.098896
    
    
      2
      26
      30235
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2655
      ...
      2.897803
      3.106304
      3.095841
      3.015667
      3.569138
      3.095841
      NaN
      NaN
      463.648478
      6.493178
    
    
      3
      26
      30314
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2655
      ...
      2.897803
      3.878160
      3.095841
      3.015667
      3.943380
      3.095841
      3.891820
      3.89182
      463.648478
      8.106606
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      ...
      2.897803
      1.563950
      2.877003
      2.877003
      3.333444
      2.877003
      3.013416
      2.87347
      463.648478
      3.269161
    
  

5 rows × 24 columns



In [19]:

    
pivot_train_categorical_useful_time = define_time_features(pivot_train_categorical_useful,
                                                                 to_predict = 't_plus_1' , t_0 = 8)



In [20]:

    
pivot_train_categorical_useful_time.head()









    Out[20]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      ...
      t1_min_t2
      t1_min_t3
      t1_min_t4
      t1_min_t5
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
    
  
  
    
      0
      26
      1182
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2061
      ...
      NaN
      NaN
      NaN
      -3.688879
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      26
      4767
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2061
      ...
      NaN
      NaN
      NaN
      0.000000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      2
      26
      30235
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      3
      26
      30314
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      ...
      0.17185
      0.17185
      0.111226
      -0.100083
      0.0
      -0.060625
      -0.271934
      -0.060625
      -0.271934
      -0.211309
    
  

5 rows × 39 columns



In [22]:

    
pivot_train_categorical_useful_time.columns









    Out[22]:





Index([u'Cliente_ID', u'Producto_ID', u'Sem3', u'Sem4', u'Sem5', u'Sem6',
       u'Sem7', u'Sem8', u'Sem9', u'Agencia_ID', u'Canal_ID', u'Ruta_SAK',
       u'agen_for_log_de', u'ruta_for_log_de', u'cliente_for_log_de',
       u'producto_for_log_de', u'agen_ruta_for_log_de',
       u'agen_cliente_for_log_de', u'agen_producto_for_log_de',
       u'ruta_cliente_for_log_de', u'ruta_producto_for_log_de',
       u'cliente_producto_for_log_de', u'cliente_for_log_sum', u'corr',
       u't_min_1', u't_min_2', u't_min_3', u't_min_4', u't_min_5',
       u't1_min_t2', u't1_min_t3', u't1_min_t4', u't1_min_t5', u't2_min_t3',
       u't2_min_t4', u't2_min_t5', u't3_min_t4', u't3_min_t5', u't4_min_t5'],
      dtype='object', name=u'Semana')

fit mean feature on target



In [33]:

    
# Linear regression features 
pivot_train_categorical_useful_time_LR = lin_regr_features(pivot_train_categorical_useful_time, semanas_numbers = [3,4,5,6,7])
pivot_train_categorical_useful_time_LR.head()









    Out[33]:






  
    
      Semana
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      ...
      t1_min_t4
      t1_min_t5
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
    
  
  
    
      0
      26
      1182
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2061
      ...
      NaN
      -3.688879
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.775590
      8.232495
    
    
      1
      26
      4767
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2061
      ...
      NaN
      0.000000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.478578
      6.855404
    
    
      2
      26
      30235
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      3.099887
      14.372579
    
    
      3
      26
      30314
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      3.966329
      18.389824
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      ...
      0.111226
      -0.100083
      0.0
      -0.060625
      -0.271934
      -0.060625
      -0.271934
      -0.211309
      1.504393
      6.975095
    
  

5 rows × 41 columns



In [34]:

    
pivot_train_categorical_useful_time_LR.columns









    Out[34]:





Index([u'Cliente_ID', u'Producto_ID', u'Sem3', u'Sem4', u'Sem5', u'Sem6',
       u'Sem7', u'Sem8', u'Sem9', u'Agencia_ID', u'Canal_ID', u'Ruta_SAK',
       u'agen_for_log_de', u'ruta_for_log_de', u'cliente_for_log_de',
       u'producto_for_log_de', u'agen_ruta_for_log_de',
       u'agen_cliente_for_log_de', u'agen_producto_for_log_de',
       u'ruta_cliente_for_log_de', u'ruta_producto_for_log_de',
       u'cliente_producto_for_log_de', u'cliente_for_log_sum', u'corr',
       u't_min_1', u't_min_2', u't_min_3', u't_min_4', u't_min_5',
       u't1_min_t2', u't1_min_t3', u't1_min_t4', u't1_min_t5', u't2_min_t3',
       u't2_min_t4', u't2_min_t5', u't3_min_t4', u't3_min_t5', u't4_min_t5',
       u'LR_prod', u'LR_prod_corr'],
      dtype='object', name=u'Semana')



In [42]:

    
pivot_train_categorical_useful_time_LR.to_csv('pivot_train_categorical_useful_time_LR.csv')



In [16]:

    
pivot_train_categorical_useful_time_LR = pd.read_csv('pivot_train_categorical_useful_time_LR.csv',index_col = 0)



In [17]:

    
pivot_train_categorical_useful_time_LR.head()









    Out[17]:






  
    
      
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      ...
      t1_min_t4
      t1_min_t5
      t2_min_t3
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
    
  
  
    
      0
      26
      1182
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2061
      ...
      NaN
      -3.688879
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.775590
      8.232495
    
    
      1
      26
      4767
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2061
      ...
      NaN
      0.000000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.478578
      6.855404
    
    
      2
      26
      30235
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      3.099887
      14.372579
    
    
      3
      26
      30314
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      3.966329
      18.389824
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      ...
      0.111226
      -0.100083
      0.0
      -0.060625
      -0.271934
      -0.060625
      -0.271934
      -0.211309
      1.504393
      6.975095
    
  

5 rows × 41 columns

add dummy feature



In [4]:

    
# pivot_train_canal = pd.get_dummies(pivot_train_categorical_useful_train['Canal_ID'])



In [5]:

    
# pivot_train_categorical_useful_train = pivot_train_categorical_useful_train.join(pivot_train_canal)
# pivot_train_categorical_useful_train.head()









    Out[5]:






  
    
      
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Canal_ID
      agen_for_log_de
      ruta_for_log_de
      ...
      LR_prod_corr
      1
      2
      4
      5
      6
      7
      8
      9
      11
    
  
  
    
      0
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2
      3.489749
      3.208232
      ...
      8.232495
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2
      3.489749
      3.208232
      ...
      6.855404
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2
      3.936934
      3.095841
      ...
      14.372579
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      3
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2
      3.936934
      3.095841
      ...
      18.389824
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      4
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2
      3.489749
      3.208232
      ...
      6.975095
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

5 rows × 46 columns

add product feature



In [2]:

    
%ls









    



0_prepocess_data.ipynb*            data_process.py*
1_xgboost.ipynb*                   origin/
agencia_for_cliente_producto.csv*  pivot_test.pickle*
baseline.ipynb*                    pivot_train_categorical_useful_time_LR.csv*
bimbo_perceptron/                  pivot_train_with_nan.pickle*
bimbo.py*                          preprocessed_products.csv*
bimbo.pyc*                         rnn.py*
canal_for_cliente_producto.csv*    ruta_for_cliente_producto.csv*
data_helper.py*                    train.py*
data_helper.pyc*



In [4]:

    
pre_product = pd.read_csv('preprocessed_products.csv',index_col = 0)
pre_product.head()


pre_product['weight_per_piece'] = pd.to_numeric(pre_product['weight_per_piece'], errors='coerce')
pre_product['weight'] = pd.to_numeric(pre_product['weight'], errors='coerce')
pre_product['pieces'] = pd.to_numeric(pre_product['pieces'], errors='coerce')

pivot_train_categorical_useful_time_LR_weight = pd.merge(pivot_train_categorical_useful_time_LR,
                                                         pre_product[['ID','weight','weight_per_piece']],
                                                         left_on = 'Producto_ID',right_on = 'ID',how = 'left')
pivot_train_categorical_useful_time_LR_weight.head()









    Out[4]:






  
    
      
      ID
      product_name
      brand
      weight
      pieces
      weight_per_piece
      has_choco
      has_vanilla
      has_multigrain
    
  
  
    
      1
      0
      NO IDENTIFICADO 0
      IDENTIFICADO
      0.0
      0.0
      0
      False
      False
      False
    
    
      2
      9
      Capuccino Moka 750g NES 9
      NES
      750.0
      1.0
      1
      False
      False
      False
    
    
      3
      41
      Bimbollos Ext sAjonjoli 6p 480g BIM 41
      BIM
      480.0
      6.0
      80
      False
      False
      False
    
    
      4
      53
      Burritos Sincro 170g CU LON 53
      LON
      170.0
      1.0
      17-
      False
      False
      False
    
    
      5
      72
      Div Tira Mini Doradita 4p 45g TR 72
      TR
      45.0
      4.0
      11.25
      False
      False
      False



In [15]:



In [18]:

    
pivot_train_categorical_useful_time_LR_weight = pd.merge(pivot_train_categorical_useful_time_LR,
                                                         pre_product[['ID','weight','weight_per_piece']],
                                                         left_on = 'Producto_ID',right_on = 'ID',how = 'left')
pivot_train_categorical_useful_time_LR_weight.head()









    Out[18]:






  
    
      
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      ...
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      ID
      weight
      weight_per_piece
    
  
  
    
      0
      26
      1182
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2061
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      1.775590
      8.232495
      1182
      210.0
      210.0
    
    
      1
      26
      4767
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2061
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      1.478578
      6.855404
      4767
      250.0
      NaN
    
    
      2
      26
      30235
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      3.099887
      14.372579
      30235
      600.0
      75.0
    
    
      3
      26
      30314
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      3.966329
      18.389824
      30314
      360.0
      NaN
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      ...
      -0.060625
      -0.271934
      -0.060625
      -0.271934
      -0.211309
      1.504393
      6.975095
      31393
      640.0
      NaN
    
  

5 rows × 44 columns



In [50]:

    
pivot_train_categorical_useful_time_LR_weight.to_csv('pivot_train_categorical_useful_time_LR_weight.csv')



In [2]:

    
pivot_train_categorical_useful_time_LR_weight = pd.read_csv('pivot_train_categorical_useful_time_LR_weight.csv',index_col = 0)
pivot_train_categorical_useful_time_LR_weight.head()









    Out[2]:






  
    
      
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      ...
      t2_min_t4
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      ID
      weight
      weight_per_piece
    
  
  
    
      0
      26
      1182
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2061
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      1.775590
      8.232495
      1182
      210.0
      210.0
    
    
      1
      26
      4767
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2061
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      1.478578
      6.855404
      4767
      250.0
      NaN
    
    
      2
      26
      30235
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      3.099887
      14.372579
      30235
      600.0
      75.0
    
    
      3
      26
      30314
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      3.966329
      18.389824
      30314
      360.0
      NaN
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      ...
      -0.060625
      -0.271934
      -0.060625
      -0.271934
      -0.211309
      1.504393
      6.975095
      31393
      640.0
      NaN
    
  

5 rows × 44 columns

add town feature



In [22]:

    
%cd '/media/siyuan/0009E198000CD19B/bimbo/origin'
%ls









    



/media/siyuan/0009E198000CD19B/bimbo/origin
cliente_tabla.csv*  producto_tabla.csv*  test.csv*  town_state.csv*  train.csv*



In [23]:

    
cliente_tabla = pd.read_csv('cliente_tabla.csv')
town_state = pd.read_csv('town_state.csv')

town_state['town_id'] = town_state['Town'].str.split()
town_state['town_id'] = town_state['Town'].str.split(expand = True)

train_basic_feature = pivot_train_categorical_useful_time_LR_weight[['Cliente_ID','Producto_ID','Agencia_ID']]

cliente_per_town = pd.merge(train_basic_feature,cliente_tabla,on = 'Cliente_ID',how= 'inner' )
cliente_per_town = pd.merge(cliente_per_town,town_state[['Agencia_ID','town_id']],on = 'Agencia_ID',how= 'inner' )

cliente_per_town_count = cliente_per_town[['NombreCliente','town_id']].groupby('town_id').count().reset_index()
cliente_per_town_count['NombreCliente'] = cliente_per_town_count['NombreCliente']/float(100000)

cliente_per_town_count_final = pd.merge(cliente_per_town[['Cliente_ID','Producto_ID','Agencia_ID','town_id']],
                                        cliente_per_town_count,on = 'town_id',how = 'left')

pivot_train_categorical_useful_time_LR_weight_town = pd.merge(pivot_train_categorical_useful_time_LR_weight,
                                                              cliente_per_town_count_final[['Cliente_ID','Producto_ID','NombreCliente']],
                                                              on = ['Cliente_ID','Producto_ID'],how = 'left')



In [24]:

    
cliente_tabla.head()









    Out[24]:






  
    
      
      Cliente_ID
      NombreCliente
    
  
  
    
      0
      0
      SIN NOMBRE
    
    
      1
      1
      OXXO XINANTECATL
    
    
      2
      2
      SIN NOMBRE
    
    
      3
      3
      EL MORENO
    
    
      4
      4
      SDN SER  DE ALIM  CUERPO SA CIA  DE INT



In [25]:

    
town_state.head()









    Out[25]:






  
    
      
      Agencia_ID
      Town
      State
    
  
  
    
      0
      1110
      2008 AG. LAGO FILT
      MÉXICO, D.F.
    
    
      1
      1111
      2002 AG. AZCAPOTZALCO
      MÉXICO, D.F.
    
    
      2
      1112
      2004 AG. CUAUTITLAN
      ESTADO DE MÉXICO
    
    
      3
      1113
      2008 AG. LAGO FILT
      MÉXICO, D.F.
    
    
      4
      1114
      2029 AG.IZTAPALAPA 2
      MÉXICO, D.F.



In [26]:

    
town_state['town_id'] = town_state['Town'].str.split()
town_state['town_id'] = town_state['Town'].str.split(expand = True)



In [27]:

    
town_state.head()









    Out[27]:






  
    
      
      Agencia_ID
      Town
      State
      town_id
    
  
  
    
      0
      1110
      2008 AG. LAGO FILT
      MÉXICO, D.F.
      2008
    
    
      1
      1111
      2002 AG. AZCAPOTZALCO
      MÉXICO, D.F.
      2002
    
    
      2
      1112
      2004 AG. CUAUTITLAN
      ESTADO DE MÉXICO
      2004
    
    
      3
      1113
      2008 AG. LAGO FILT
      MÉXICO, D.F.
      2008
    
    
      4
      1114
      2029 AG.IZTAPALAPA 2
      MÉXICO, D.F.
      2029



In [13]:

    
pivot_train_categorical_useful_time_LR_weight.columns.values









    Out[13]:





array(['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 'ID', 'weight', 'weight_per_piece'], dtype=object)



In [34]:

    
train_basic_feature = pivot_train_categorical_useful_time_LR_weight[['Cliente_ID','Producto_ID','Agencia_ID']]



In [35]:

    
cliente_per_town = pd.merge(train_basic_feature,cliente_tabla,on = 'Cliente_ID',how= 'inner' )
cliente_per_town = pd.merge(cliente_per_town,town_state[['Agencia_ID','town_id']],on = 'Agencia_ID',how= 'inner' )



In [36]:

    
cliente_per_town.head()









    Out[36]:






  
    
      
      Cliente_ID
      Producto_ID
      Agencia_ID
      NombreCliente
      town_id
    
  
  
    
      0
      26
      1182
      2061
      BODEGA COMERCIAL MEXICANA TOLUCA
      2175
    
    
      1
      26
      4767
      2061
      BODEGA COMERCIAL MEXICANA TOLUCA
      2175
    
    
      2
      26
      31393
      2061
      BODEGA COMERCIAL MEXICANA TOLUCA
      2175
    
    
      3
      26
      31690
      2061
      BODEGA COMERCIAL MEXICANA TOLUCA
      2175
    
    
      4
      26
      32953
      2061
      BODEGA COMERCIAL MEXICANA TOLUCA
      2175



In [37]:

    
cliente_per_town_count = cliente_per_town[['NombreCliente','town_id']].groupby('town_id').count().reset_index()
cliente_per_town_count['NombreCliente'] = cliente_per_town_count['NombreCliente']/float(100000)



In [38]:

    
cliente_per_town_count.head()









    Out[38]:






  
    
      
      town_id
      NombreCliente
    
  
  
    
      0
      2001
      2.74188
    
    
      1
      2002
      1.42954
    
    
      2
      2003
      1.74497
    
    
      3
      2004
      1.22725
    
    
      4
      2007
      1.97018



In [39]:

    
cliente_per_town_count_final = pd.merge(cliente_per_town[['Cliente_ID','Producto_ID','Agencia_ID','town_id']],
                                        cliente_per_town_count,on = 'town_id',how = 'left')
cliente_per_town_count_final.head()









    Out[39]:






  
    
      
      Cliente_ID
      Producto_ID
      Agencia_ID
      town_id
      NombreCliente
    
  
  
    
      0
      26
      1182
      2061
      2175
      0.34498
    
    
      1
      26
      4767
      2061
      2175
      0.34498
    
    
      2
      26
      31393
      2061
      2175
      0.34498
    
    
      3
      26
      31690
      2061
      2175
      0.34498
    
    
      4
      26
      32953
      2061
      2175
      0.34498



In [42]:

    
pivot_train_categorical_useful_time_LR_weight_town = pd.merge(pivot_train_categorical_useful_time_LR_weight,
                                                              cliente_per_town_count_final[['Cliente_ID','Producto_ID','NombreCliente']],
                                                              on = ['Cliente_ID','Producto_ID'],how = 'left')
pivot_train_categorical_useful_time_LR_weight_town.head()









    Out[42]:






  
    
      
      Cliente_ID
      Producto_ID
      Sem3
      Sem4
      Sem5
      Sem6
      Sem7
      Sem8
      Sem9
      Agencia_ID
      ...
      t2_min_t5
      t3_min_t4
      t3_min_t5
      t4_min_t5
      LR_prod
      LR_prod_corr
      ID
      weight
      weight_per_piece
      NombreCliente
    
  
  
    
      0
      26
      1182
      3.688879
      NaN
      NaN
      NaN
      0.000000
      0.000000
      NaN
      2061
      ...
      NaN
      NaN
      NaN
      NaN
      1.775590
      8.232495
      1182
      210.0
      210.0
      0.34498
    
    
      1
      26
      4767
      3.761200
      NaN
      NaN
      NaN
      3.761200
      3.761200
      NaN
      2061
      ...
      NaN
      NaN
      NaN
      NaN
      1.478578
      6.855404
      4767
      250.0
      NaN
      0.34498
    
    
      2
      26
      30235
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      4.574711
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      3.099887
      14.372579
      30235
      600.0
      75.0
      0.00734
    
    
      3
      26
      30314
      NaN
      NaN
      NaN
      NaN
      3.891820
      NaN
      NaN
      2655
      ...
      NaN
      NaN
      NaN
      NaN
      3.966329
      18.389824
      30314
      360.0
      NaN
      0.00734
    
    
      4
      26
      31393
      3.044522
      2.833213
      2.772589
      2.772589
      2.944439
      3.135494
      2.639057
      2061
      ...
      -0.271934
      -0.060625
      -0.271934
      -0.211309
      1.504393
      6.975095
      31393
      640.0
      NaN
      0.34498
    
  

5 rows × 45 columns



In [43]:

    
pivot_train_categorical_useful_time_LR_weight_town.columns.values









    Out[43]:





array(['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 'ID', 'weight', 'weight_per_piece', 'NombreCliente'], dtype=object)

begin xgboost training



In [7]:

    
train_pivot_xgb_time1.columns.values









    Out[7]:





array(['Agencia_ID', 'Canal_ID', 'Cliente_ID', 'LR_prod', 'LR_prod_corr',
       'NombreCliente', 'Producto_ID', 'Ruta_SAK', 'Sem3_cum', 'Sem4_cum',
       'Sem5_cum', 'Sem6_cum', 'Sem7_cum', 'Sem8_cum',
       'agen_cliente_for_log_de', 'agen_for_log_de',
       'agen_producto_for_log_de', 'agen_ruta_for_log_de',
       'cliente_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_for_log_de', 'corr', 'producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_for_log_de',
       'ruta_producto_for_log_de', 't1_min_t2', 't1_min_t3', 't1_min_t4',
       't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5', 't3_min_t4',
       't3_min_t5', 't4_min_t5', 't_min_1', 't_min_2', 't_min_3',
       't_min_4', 't_min_5', 'target'], dtype=object)



In [8]:

    
train_pivot_xgb_time1 = train_pivot_xgb_time1.drop(['Cliente_ID','Producto_ID','Agencia_ID',
                                                                            'Ruta_SAK','Canal_ID'],axis = 1)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-8-bf85e99ba623> in <module>()
      1 train_pivot_xgb_time1 = train_pivot_xgb_time1.drop(['Cliente_ID','Producto_ID','Agencia_ID',
----> 2                                                                             'Ruta_SAK','ID','Canal_ID'],axis = 1)

/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in drop(self, labels, axis, level, inplace, errors)
   1875                 new_axis = axis.drop(labels, level=level, errors=errors)
   1876             else:
-> 1877                 new_axis = axis.drop(labels, errors=errors)
   1878             dropped = self.reindex(**{axis_name: new_axis})
   1879             try:

/usr/local/lib/python2.7/dist-packages/pandas/indexes/base.pyc in drop(self, labels, errors)
   3049             if errors != 'ignore':
   3050                 raise ValueError('labels %s not contained in axis' %
-> 3051                                  labels[mask])
   3052             indexer = indexer[~mask]
   3053         return self.delete(indexer)

ValueError: labels ['ID'] not contained in axis



In [45]:

    
pivot_train_categorical_useful_train_time_no_nan = pivot_train_categorical_useful_train[pivot_train_categorical_useful_train['Sem8'].notnull()]
# pivot_train_categorical_useful_train_time_no_nan = pivot_train_categorical_useful_train[pivot_train_categorical_useful_train['Sem9'].notnull()]



In [46]:

    
pivot_train_categorical_useful_train_time_no_nan_sample = pivot_train_categorical_useful_train_time_no_nan.sample(1000000)



In [47]:

    
train_feature = pivot_train_categorical_useful_train_time_no_nan_sample.drop(['Sem8','Sem9'],axis = 1)
train_label = pivot_train_categorical_useful_train_time_no_nan_sample[['Sem8','Sem9']]



In [9]:

    
#seperate train and test data
# datasource: sparse_week_Agencia_Canal_Ruta_normalized_csr label:train_label
%time train_set, valid_set, train_labels, valid_labels = train_test_split(train_feature,\
                                                                                  train_label, test_size=0.10)









    



CPU times: user 736 ms, sys: 36 ms, total: 772 ms
Wall time: 891 ms



In [48]:

    
# dtrain = xgb.DMatrix(train_feature,label = train_label['Sem8'],missing=NaN)
dtrain = xgb.DMatrix(train_feature,label = train_label['Sem8'],missing=NaN)



In [49]:

    
param = {'booster':'gbtree',
         'nthread': 7,
         'max_depth':6, 
         'eta':0.2,
         'silent':0,
         'subsample':0.7, 
         'objective':'reg:linear',
         'eval_metric':'rmse',
         'colsample_bytree':0.7}
# param = {'eta':0.1, 'eval_metric':'rmse','nthread': 8}


# evallist  = [(dvalid,'eval'), (dtrain,'train')]



In [ ]:

    
num_round = 1000
# plst = param.items()

# bst = xgb.train( plst, dtrain, num_round, evallist )
cvresult = xgb.cv(param, dtrain, num_round, nfold=5,show_progress=True,show_stdv=False,
                        seed = 0, early_stopping_rounds=10)
print(cvresult.tail())

for 1 week later

cv rmse 0.451181 with dummy canal, time regr,
cv rmse 0.450972 without dummy canal, time regr,
cv rmse 0.4485676 without dummy canal, time regr, producto info
cv rmse 0.4487434 without dummy canal, time regr, producto info, cliente_per_town

for 2 week later

cv rmse 0.4513236 without dummy canal, time regr, producto info



In [51]:

    
# xgb.plot_importance(cvresult)



In [ ]:

	Semana	Agencia_ID	Canal_ID	Ruta_SAK	Cliente_ID	Producto_ID	log_demand
0	3	1110	7	3301	15766	1212	1.386294
1	3	1110	7	3301	15766	1216	1.609438
2	3	1110	7	3301	15766	1238	1.609438
3	3	1110	7	3301	15766	1240	1.609438
4	3	1110	7	3301	15766	1242	1.386294

Semana	Cliente_ID	Producto_ID	Sem3	Sem4	Sem5	Sem6	Sem7	Sem8	Sem9	Agencia_ID	Canal_ID	Ruta_SAK
0	26	1182	3.688879	NaN	NaN	NaN	0.000000	0.000000	NaN	2061	2	7212
1	26	4767	3.761200	NaN	NaN	NaN	3.761200	3.761200	NaN	2061	2	7212
2	26	30235	NaN	NaN	NaN	NaN	NaN	NaN	4.574711	2655	2	4189
3	26	30314	NaN	NaN	NaN	NaN	3.891820	NaN	NaN	2655	2	4189
4	26	31393	3.044522	2.833213	2.772589	2.772589	2.944439	3.135494	2.639057	2061	2	7212

Semana	Cliente_ID	Producto_ID	Sem3	Sem4	Sem5	Sem6	Sem7	Sem8	Sem9	id	sem10_sem11	Agencia_ID	Canal_ID	Ruta_SAK
0	26	31518	NaN	2.397895	NaN	NaN	NaN	NaN	NaN	1569352	10	2655	2	4189
1	26	34206	4.795791	4.406719	3.76120	4.248495	4.574711	4.499810	4.682131	1547831	11	2061	2	7212
2	26	34210	2.890372	3.761200	3.78419	3.496508	3.433987	3.688879	3.688879	6667200	10	2061	2	7212
3	26	34785	3.091042	NaN	NaN	2.833213	3.332205	2.564949	3.218876	1592616	10	2061	2	7212
4	26	34785	3.091042	NaN	NaN	2.833213	3.332205	2.564949	3.218876	6825659	11	2061	2	7212

	id	Semana	Agencia_ID	Canal_ID	Ruta_SAK	Cliente_ID	Producto_ID
0	0	11	4037	1	2209	4639078	35305
1	1	11	2237	1	1226	4705135	1238
2	2	10	2045	1	2831	4549769	32940
3	3	11	1227	1	4448	4717855	43066
4	4	11	1219	1	1130	966351	1277

Semana	Cliente_ID	Producto_ID	Sem3	Sem4	Sem5	Sem6	Sem7	Sem8	Sem9	id	Semana	Agencia_ID	Canal_ID	Ruta_SAK
0	26.0	31518.0	NaN	2.397895	NaN	NaN	NaN	NaN	NaN	1569352	10	2655	2	4189
1	26.0	34206.0	4.795791	4.406719	3.76120	4.248495	4.574711	4.499810	4.682131	1547831	11	2061	2	7212
2	26.0	34210.0	2.890372	3.761200	3.78419	3.496508	3.433987	3.688879	3.688879	6667200	10	2061	2	7212
3	26.0	34785.0	3.091042	NaN	NaN	2.833213	3.332205	2.564949	3.218876	1592616	10	2061	2	7212
4	26.0	34785.0	3.091042	NaN	NaN	2.833213	3.332205	2.564949	3.218876	6825659	11	2061	2	7212

Semana	Cliente_ID	Producto_ID	id	sem10_sem11	Agencia_ID	Canal_ID	Ruta_SAK	ruta_freq	clien_freq	agen_freq	prod_freq	agen_for_log_de	ruta_for_log_de	cliente_for_log_de	producto_for_log_de	agen_ruta_for_log_de	agen_cliente_for_log_de	agen_producto_for_log_de	ruta_cliente_for_log_de	ruta_producto_for_log_de	cliente_producto_for_log_de	cliente_for_log_sum	cliente_producto_agen_for_log_sum	corr	t_min_6	t_min_2	t_min_3	t_min_4	t_min_5	t2_min_t6	t3_min_t6	t4_min_t6	t5_min_t6	t2_min_t3	t2_min_t4	t2_min_t5	t3_min_t4	t3_min_t5	t4_min_t5	LR_prod	LR_prod_corr	t_m_6_cum	t_m_5_cum	t_m_4_cum	t_m_3_cum	t_m_2_cum	NombreCliente	weight	weight_per_piece	pieces
0	26.0	34206.0	1547831.0	11.0	2061.0	2.0	7212.0	713.0	166.0	12208.0	91578.0	3.523074	3.222417	2.890955	1.724493	2.835826	2.835826	4.809507	2.835826	4.642291	4.353270	479.898621	4.353270	3.596229	3.761200	4.682131	4.499810	4.574711	4.248495	0.920931	0.738610	0.813511	0.487295	0.182322	0.107420	0.433636	-0.074901	0.251314	0.326216	1.536319	7.372772	3.761200	8.009695	12.584406	17.084215	21.766348	5954.0	340.0	42.500	8.0
1	26.0	34785.0	6825659.0	11.0	2061.0	2.0	7212.0	713.0	166.0	12208.0	10780.0	3.523074	3.222417	2.890955	3.987424	2.835826	2.835826	3.836502	2.835826	3.786542	2.987311	479.898621	2.987311	8.315309	0.000000	3.218876	2.564949	3.332205	2.833213	3.218876	2.564949	3.332205	2.833213	0.653926	-0.113329	0.385662	-0.767255	-0.268264	0.498991	3.918295	18.803841	0.000000	2.833213	6.165418	8.730368	11.949243	5954.0	480.0	NaN	NaN
2	26.0	35148.0	5853787.0	11.0	2061.0	2.0	7212.0	713.0	166.0	12208.0	11047.0	3.523074	3.222417	2.890955	2.694399	2.835826	2.835826	3.035542	2.835826	2.942251	2.600500	479.898621	2.600500	5.618855	1.791759	2.995732	3.044523	2.397895	2.772589	1.203973	1.252763	0.606136	0.980829	-0.048790	0.597837	0.223144	0.646627	0.271934	-0.374693	2.439828	11.708699	1.791759	4.564348	6.962244	10.006766	13.002499	5954.0	567.0	NaN	NaN
3	26.0	43335.0	2316053.0	11.0	2061.0	2.0	7212.0	713.0	166.0	12208.0	4912.0	3.523074	3.222417	2.890955	2.392465	2.835826	2.835826	2.128167	2.835826	1.912965	0.693147	479.898621	0.693147	4.989207	0.000000	1.386294	0.000000	0.000000	0.000000	1.386294	0.000000	0.000000	0.000000	1.386294	1.386294	1.386294	0.000000	0.000000	0.000000	2.195412	10.535754	0.000000	0.000000	0.000000	0.000000	1.386294	5954.0	435.0	54.375	8.0
4	26.0	43342.0	900676.0	11.0	2061.0	2.0	7212.0	713.0	166.0	12208.0	10614.0	3.523074	3.222417	2.890955	2.745177	2.835826	2.835826	2.862706	2.835826	2.438047	1.978298	479.898621	1.978298	5.724748	1.098612	3.044523	1.791759	0.000000	0.000000	1.945910	0.693147	-1.098612	-1.098612	1.252763	3.044523	3.044523	1.791759	1.791759	0.000000	2.509016	12.040730	1.098612	1.098612	1.098612	2.890372	5.934894	5954.0	248.0	62.000	4.0

Semana	Cliente_ID	Producto_ID	id	sem10_sem11	Agencia_ID	Canal_ID	Ruta_SAK	ruta_freq	clien_freq	agen_freq	prod_freq	agen_for_log_de	ruta_for_log_de	cliente_for_log_de	producto_for_log_de	agen_ruta_for_log_de	agen_cliente_for_log_de	agen_producto_for_log_de	ruta_cliente_for_log_de	ruta_producto_for_log_de	cliente_producto_for_log_de	cliente_for_log_sum	cliente_producto_agen_for_log_sum	corr	t_min_6	t_min_2	t_min_3	t_min_4	t_min_5	t2_min_t6	t3_min_t6	t4_min_t6	t5_min_t6	t2_min_t3	t2_min_t4	t2_min_t5	t3_min_t4	t3_min_t5	t4_min_t5	LR_prod	LR_prod_corr	t_m_6_cum	t_m_5_cum	t_m_4_cum	t_m_3_cum	t_m_2_cum	NombreCliente	weight	weight_per_piece	pieces
0	26.0	31518.0	1569352.0	10.0	2655.0	2.0	4189.0	10.0	158.0	3861.0	169.0	3.951774	3.165635	2.83547	2.493198	3.165635	3.033128	2.475491	3.165635	2.397895	2.397895	448.004242	2.397895	5.099486	2.397895	0.000000	0.000000	0.000000	0.000000	-2.397895	-2.397895	-2.397895	-2.397895	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-0.301235	-1.349546	2.397895	2.397895	2.397895	2.397895	2.397895	131.0	691.0	NaN	NaN
1	26.0	34210.0	6667200.0	10.0	2061.0	2.0	7212.0	658.0	158.0	12429.0	10746.0	3.509364	3.211487	2.83547	4.374220	2.805172	2.805172	4.490424	2.805172	3.720000	3.632953	448.004242	3.632953	8.946852	3.761200	3.688879	3.433987	3.496508	3.784190	-0.072321	-0.327213	-0.264693	0.022990	0.254892	0.192372	-0.095310	-0.062520	-0.350202	-0.287682	4.428435	19.839575	3.761200	7.545390	11.041898	14.475884	18.164764	6027.0	740.0	NaN	NaN
2	26.0	34785.0	1592616.0	10.0	2061.0	2.0	7212.0	658.0	158.0	12429.0	10736.0	3.509364	3.211487	2.83547	3.987994	2.805172	2.805172	3.839655	2.805172	3.789903	2.910122	448.004242	2.910122	8.156881	0.000000	2.564949	3.332205	2.833213	0.000000	2.564949	3.332205	2.833213	0.000000	-0.767255	-0.268264	2.564949	0.498991	3.332205	2.833213	3.992911	17.888412	0.000000	0.000000	2.833213	6.165418	8.730368	6027.0	480.0	NaN	NaN
3	26.0	34786.0	3909690.0	10.0	2061.0	2.0	7212.0	658.0	158.0	12429.0	13169.0	3.509364	3.211487	2.83547	4.535940	2.805172	2.805172	4.738372	2.805172	4.336529	4.143021	448.004242	4.143021	9.277627	4.158883	4.094345	4.317488	3.713572	4.430817	-0.064539	0.158605	-0.445311	0.271934	-0.223144	0.380773	-0.336472	0.603916	-0.113329	-0.717245	4.513783	20.221937	4.158883	8.589700	12.303272	16.620760	20.715105	6027.0	680.0	NaN	NaN
4	26.0	35142.0	3659672.0	10.0	2061.0	2.0	7212.0	658.0	158.0	12429.0	10624.0	3.509364	3.211487	2.83547	3.268587	2.805172	2.805172	4.077493	2.805172	3.415331	3.320460	448.004242	3.320460	6.685434	1.945910	3.828641	3.688879	3.555348	3.583519	1.882731	1.742969	1.609438	1.637609	0.139762	0.273293	0.245122	0.133531	0.105361	-0.028171	2.966826	13.291507	1.945910	5.529429	9.084777	12.773657	16.602299	6027.0	567.0	NaN	NaN

Semana	Cliente_ID	Producto_ID	Agencia_ID	Canal_ID	Ruta_SAK	ruta_freq	clien_freq	agen_freq	prod_freq	agen_for_log_de	ruta_for_log_de	cliente_for_log_de	producto_for_log_de	agen_ruta_for_log_de	agen_cliente_for_log_de	agen_producto_for_log_de	ruta_cliente_for_log_de	ruta_producto_for_log_de	cliente_producto_for_log_de	cliente_for_log_sum	cliente_producto_agen_for_log_sum	corr	t_min_1	t_min_2	t_min_3	t_min_4	t_min_5	t1_min_t2	t1_min_t3	t1_min_t4	t1_min_t5	t2_min_t3	t2_min_t4	t2_min_t5	t3_min_t4	t3_min_t5	t4_min_t5	LR_prod	LR_prod_corr	target	t_m_5_cum	t_m_4_cum	t_m_3_cum	t_m_2_cum	t_m_1_cum	NombreCliente	weight	weight_per_piece	pieces
0	26.0	30235.0	2655.0	2.0	4189.0	10.0	158.0	3861.0	2062.0	3.951774	3.165635	2.83547	3.122565	3.165635	3.033128	3.516439	3.165635	NaN	NaN	448.004242	NaN	6.386767	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.314400	5.888566	4.574711	0.000000	0.000000	0.000000	0.000000	0.000000	233.0	600.0	75.00	8.0
1	26.0	31393.0	2061.0	2.0	7212.0	658.0	158.0	12429.0	16321.0	3.509364	3.211487	2.83547	1.557049	2.805172	2.805172	3.350600	2.805172	3.125611	2.891665	448.004242	2.891665	3.184724	3.135494	2.944439	2.772589	2.772589	2.833213	0.191055	0.362906	0.362906	0.302281	0.171850	0.171850	0.111226	0.000000	-0.060625	-0.060625	1.269118	5.685703	2.639057	2.833213	5.605802	8.378390	11.322830	14.458324	18989.0	640.0	NaN	NaN
2	26.0	32962.0	2061.0	2.0	7212.0	658.0	158.0	12429.0	7375.0	3.509364	3.211487	2.83547	2.643195	2.805172	2.805172	2.410218	2.805172	2.225570	NaN	448.004242	NaN	5.406284	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.042073	9.148576	2.397895	0.000000	0.000000	0.000000	0.000000	0.000000	18989.0	180.0	30.00	6.0
3	26.0	34204.0	2061.0	2.0	7212.0	658.0	158.0	12429.0	17155.0	3.509364	3.211487	2.83547	2.691697	2.805172	2.805172	3.825037	2.805172	3.796168	3.647111	448.004242	3.647111	5.505487	3.828641	3.610918	3.555348	3.806663	3.433987	0.217723	0.273293	0.021979	0.394654	0.055570	-0.195745	0.176931	-0.251314	0.121361	0.372675	2.526538	11.318999	3.784190	3.433987	7.240650	10.795998	14.406916	18.235558	18989.0	450.0	56.25	8.0
4	26.0	34206.0	2061.0	2.0	7212.0	658.0	158.0	12429.0	80184.0	3.509364	3.211487	2.83547	1.734170	2.805172	2.805172	4.830257	2.805172	4.632751	4.298187	448.004242	4.298187	3.547001	4.499810	4.574711	4.248495	3.761200	4.406719	-0.074901	0.251314	0.738610	0.093090	0.326216	0.813511	0.167992	0.487295	-0.158224	-0.645519	0.990179	4.436046	4.682131	4.406719	8.167919	12.416414	16.991125	21.490934	18989.0	340.0	42.50	8.0

Semana	Cliente_ID	Producto_ID	Agencia_ID	Canal_ID	Ruta_SAK	ruta_freq	clien_freq	agen_freq	prod_freq	agen_for_log_de	ruta_for_log_de	cliente_for_log_de	producto_for_log_de	agen_ruta_for_log_de	agen_cliente_for_log_de	agen_producto_for_log_de	ruta_cliente_for_log_de	ruta_producto_for_log_de	cliente_producto_for_log_de	cliente_for_log_sum	cliente_producto_agen_for_log_sum	corr	t_min_1	t_min_2	t_min_3	t_min_4	t_min_5	t1_min_t2	t1_min_t3	t1_min_t4	t1_min_t5	t2_min_t3	t2_min_t4	t2_min_t5	t3_min_t4	t3_min_t5	t4_min_t5	LR_prod	LR_prod_corr	target	t_m_5_cum	t_m_4_cum	t_m_3_cum	t_m_2_cum	t_m_1_cum	NombreCliente	weight	weight_per_piece	pieces
0	26.0	1182.0	2061.0	2.0	7212.0	548.0	160.0	12574.0	411493.0	3.489749	3.208231	2.897803	1.766994	2.877003	2.877003	4.370318	2.877003	3.694196	1.844440	463.648468	1.844440	3.693589	0.000000	0.000000	0.000000	0.000000	3.688879	0.000000	0.000000	0.000000	-3.688879	0.000000	0.000000	-3.688879	0.000000	-3.688879	-3.688879	1.127138	5.225957	0.000000	3.688879	3.688879	3.688879	3.688879	3.688879	18434.0	210.0	210.00	1.0
1	26.0	4767.0	2061.0	2.0	7212.0	548.0	160.0	12574.0	112080.0	3.489749	3.208231	2.897803	1.482496	2.877003	2.877003	4.253512	2.877003	4.164880	3.761200	463.648468	3.761200	3.098896	3.761200	0.000000	0.000000	0.000000	3.761200	3.761200	3.761200	3.761200	0.000000	0.000000	0.000000	-3.761200	0.000000	-3.761200	-3.761200	0.768180	3.561656	3.761200	3.761200	3.761200	3.761200	3.761200	7.522400	18434.0	250.0	NaN	NaN
2	26.0	31393.0	2061.0	2.0	7212.0	548.0	160.0	12574.0	15736.0	3.489749	3.208231	2.897803	1.563950	2.877003	2.877003	3.333444	2.877003	3.013416	2.873471	463.648468	2.873471	3.269161	2.944439	2.772589	2.772589	2.833213	3.044523	0.171850	0.171850	0.111226	-0.100083	0.000000	-0.060625	-0.271934	-0.060625	-0.271934	-0.211309	1.298646	6.021154	3.135494	3.044523	5.877736	8.650325	11.422914	14.367352	18434.0	640.0	NaN	NaN
3	26.0	34204.0	2061.0	2.0	7212.0	548.0	160.0	12574.0	17261.0	3.489749	3.208231	2.897803	2.674463	2.877003	2.877003	3.848593	2.877003	3.819687	3.638221	463.648468	3.638221	5.590491	3.610918	3.555348	3.806663	3.433987	3.784190	0.055570	-0.195745	0.176931	-0.173272	-0.251314	0.121361	-0.228842	0.372675	0.022473	-0.350202	2.560121	11.869964	3.828641	3.784190	7.218177	11.024839	14.580188	18.191105	18434.0	450.0	56.25	8.0
4	26.0	34206.0	2061.0	2.0	7212.0	548.0	160.0	12574.0	87803.0	3.489749	3.208231	2.897803	1.730373	2.877003	2.877003	4.836590	2.877003	4.628871	4.357383	463.648468	4.357383	3.617039	4.574711	4.248495	3.761200	4.406719	4.795791	0.326216	0.813511	0.167992	-0.221080	0.487295	-0.158224	-0.547295	-0.645519	-1.034590	-0.389071	1.379329	6.395236	4.499810	4.795791	9.202510	12.963710	17.212206	21.786917	18434.0	340.0	42.50	8.0

	Agencia_ID	Canal_ID	Cliente_ID	LR_prod	LR_prod_corr	NombreCliente	Producto_ID	Ruta_SAK	agen_cliente_for_log_de	agen_for_log_de	...	t_m_3_cum	t_m_4_cum	t_m_5_cum	t_min_2	t_min_3	t_min_4	t_min_5	target	weight	weight_per_piece
0	2061	2	26	2.001190	7.293554	18434	1182	7212	2.852285	3.491654	...	NaN	NaN	3.688879	NaN	NaN	NaN	3.688879	0.000000	210.0	210.00
1	2061	2	26	1.839411	6.703932	18434	4767	7212	2.852285	3.491654	...	NaN	NaN	3.761200	NaN	NaN	NaN	3.761200	3.761200	250.0	NaN
2	2061	2	26	1.911283	6.965878	18434	31393	7212	2.852285	3.491654	...	8.650325	5.877736	3.044522	2.772589	2.772589	2.833213	3.044522	3.135494	640.0	NaN
3	2061	2	26	3.113374	11.347029	18434	34204	7212	2.852285	3.491654	...	11.024839	7.218177	3.784190	3.555348	3.806662	3.433987	3.784190	3.828641	450.0	56.25
4	2061	2	26	2.031231	7.403043	18434	34206	7212	2.852285	3.491654	...	12.963710	9.202510	4.795791	4.248495	3.761200	4.406719	4.795791	4.499810	340.0	42.50

	ID	product_name	brand	weight	pieces	weight_per_piece	has_choco	has_vanilla	has_multigrain
1	0	NO IDENTIFICADO 0	IDENTIFICADO	0.0	0.0	0	False	False	False
2	9	Capuccino Moka 750g NES 9	NES	750.0	1.0	1	False	False	False
3	41	Bimbollos Ext sAjonjoli 6p 480g BIM 41	BIM	480.0	6.0	80	False	False	False
4	53	Burritos Sincro 170g CU LON 53	LON	170.0	1.0	17-	False	False	False
5	72	Div Tira Mini Doradita 4p 45g TR 72	TR	45.0	4.0	11.25	False	False	False

	Cliente_ID	NombreCliente
0	0	SIN NOMBRE
1	1	OXXO XINANTECATL
2	2	SIN NOMBRE
3	3	EL MORENO
4	4	SDN SER DE ALIM CUERPO SA CIA DE INT

	Agencia_ID	Town	State
0	1110	2008 AG. LAGO FILT	MÉXICO, D.F.
1	1111	2002 AG. AZCAPOTZALCO	MÉXICO, D.F.
2	1112	2004 AG. CUAUTITLAN	ESTADO DE MÉXICO
3	1113	2008 AG. LAGO FILT	MÉXICO, D.F.
4	1114	2029 AG.IZTAPALAPA 2	MÉXICO, D.F.

	Cliente_ID	Producto_ID	Agencia_ID	NombreCliente	town_id
0	26	1182	2061	BODEGA COMERCIAL MEXICANA TOLUCA	2175
1	26	4767	2061	BODEGA COMERCIAL MEXICANA TOLUCA	2175
2	26	31393	2061	BODEGA COMERCIAL MEXICANA TOLUCA	2175
3	26	31690	2061	BODEGA COMERCIAL MEXICANA TOLUCA	2175
4	26	32953	2061	BODEGA COMERCIAL MEXICANA TOLUCA	2175

	town_id	NombreCliente
0	2001	2.74188
1	2002	1.42954
2	2003	1.74497
3	2004	1.22725
4	2007	1.97018