In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import coo_matrix,csr_matrix,csc_matrix, hstack
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn import linear_model
import gc

In [2]:
%pwd


Out[2]:
u'/home/siyuan/Documents/bimbo_kaggle'

In [4]:
%ls


1_xgboost.ipynb                   submission_10.csv
3_prediction.ipynb                submission_10_new.csv
44fea_bst.model                   submission_11.csv
4_keras_nn.ipynb                  submission_11_new.csv
5_random_forest.ipynb             submission_nn_2.csv
6_random_forest.ipynb             submission_nn.csv
agencia_for_cliente_producto.csv  submission_nn_xgb
agen_freq_semana.pickle           submission_xgb_10.pickle
canal_for_cliente_producto.csv    submission_xgb_2.csv
clien_freq_semana.pickle          submission_xgb.csv
model_nn_10_after_l2reg.h5        submission_xgb_nn_10.pickle
model_nn_10.h5                    submission_xgb_with_nn.csv
model_nn_10_whole.h5              train_pivot_3456_to_8.csv
origin/                           train_pivot_45678_to_9_new.csv
pivot_test.pickle                 train_pivot_56789_to_10_new.pickle
pivot_train_with_nan.pickle       train_pivot_56789_to_10.pickle
pivot_train_with_zero.pickle      train_pivot_6789_to_11_new.pickle
preprocessed_products.csv         train_pivot_6789_to_11.pickle
prod_freq_semana.pickle           train_pivot_xgb_time1_44fea.csv
ruta_for_cliente_producto.csv     train_pivot_xgb_time1.csv
ruta_freq_semana.pickle           train_pivot_xgb_time2_38fea.csv
stack_train_nn_10.pickle          train_pivot_xgb_time2.csv

In [2]:
dtypes = {'Semana' : 'int32',
                              'Agencia_ID' :'int32',
                              'Canal_ID' : 'int32',
                              'Ruta_SAK' : 'int32',
                              'Cliente-ID' : 'int32',
                              'Producto_ID':'int32',
                              'Venta_hoy':'float32',
                              'Venta_uni_hoy': 'int32',
                              'Dev_uni_proxima':'int32',
                              'Dev_proxima':'float32',
                              'Demanda_uni_equil':'int32'}

In [3]:
train_dataset = pd.read_csv('origin/train.csv',
                            usecols =['Semana','Agencia_ID','Canal_ID','Ruta_SAK','Cliente_ID','Producto_ID','Demanda_uni_equil'],
                            dtype  = dtypes)
train_dataset['log_demand'] = train_dataset['Demanda_uni_equil'].apply(np.log1p)
train_dataset.drop(['Demanda_uni_equil'],axis = 1,inplace = True)
train_dataset.head()


Out[3]:
Semana Agencia_ID Canal_ID Ruta_SAK Cliente_ID Producto_ID log_demand
0 3 1110 7 3301 15766 1212 1.386294
1 3 1110 7 3301 15766 1216 1.609438
2 3 1110 7 3301 15766 1238 1.609438
3 3 1110 7 3301 15766 1240 1.609438
4 3 1110 7 3301 15766 1242 1.386294

In [5]:
pivot_train = pd.read_pickle('pivot_train_with_nan.pickle')
pivot_train = pivot_train.rename(columns={3: 'Sem3', 4: 'Sem4',5: 'Sem5', 6: 'Sem6',7: 'Sem7', 8: 'Sem8',9: 'Sem9'})
pivot_train.head()


Out[5]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID Canal_ID Ruta_SAK
0 26 1182 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2061 2 7212
1 26 4767 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2061 2 7212
2 26 30235 NaN NaN NaN NaN NaN NaN 4.574711 2655 2 4189
3 26 30314 NaN NaN NaN NaN 3.891820 NaN NaN 2655 2 4189
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 2 7212

In [6]:
pivot_test = pd.read_pickle('pivot_test.pickle')
pivot_test.rename(columns = {'Semana':'sem10_sem11'},inplace = True)
pivot_test.shape


Out[6]:
(6999251, 14)

In [7]:
pivot_test.columns.values


Out[7]:
array(['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9', 'id', 'sem10_sem11', 'Agencia_ID', 'Canal_ID',
       'Ruta_SAK'], dtype=object)

make the train_pivot, duplicate exist when index = ['Cliente','Producto']

for each cliente & producto, first find its most common Agencia_ID, Canal_ID, Ruta_SAK



In [29]:
agencia_for_cliente_producto = train_dataset[['Cliente_ID','Producto_ID'
                                              ,'Agencia_ID']].groupby(['Cliente_ID',
                                                                        'Producto_ID']).agg(lambda x:x.value_counts().index[0]).reset_index()
canal_for_cliente_producto = train_dataset[['Cliente_ID',
                                            'Producto_ID','Canal_ID']].groupby(['Cliente_ID',
                                                                        'Producto_ID']).agg(lambda x:x.value_counts().index[0]).reset_index()
ruta_for_cliente_producto = train_dataset[['Cliente_ID',
                                           'Producto_ID','Ruta_SAK']].groupby(['Cliente_ID',
                                                                        'Producto_ID']).agg(lambda x:x.value_counts().index[0]).reset_index()

In [22]:
gc.collect()


Out[22]:
775

In [37]:
agencia_for_cliente_producto.to_pickle('agencia_for_cliente_producto.csv')
canal_for_cliente_producto.to_pickle('canal_for_cliente_producto.csv')
ruta_for_cliente_producto.to_pickle('ruta_for_cliente_producto.csv')

In [8]:
agencia_for_cliente_producto = pd.read_pickle('agencia_for_cliente_producto.csv')
canal_for_cliente_producto = pd.read_pickle('canal_for_cliente_producto.csv')
ruta_for_cliente_producto = pd.read_pickle('ruta_for_cliente_producto.csv')

In [6]:
# train_dataset['log_demand'] = train_dataset['Demanda_uni_equil'].apply(np.log1p)
pivot_train = pd.pivot_table(data= train_dataset[['Cliente_ID','Producto_ID','log_demand','Semana']],
                                        values='log_demand', index=['Cliente_ID','Producto_ID'],
                                        columns=['Semana'], aggfunc=np.mean,fill_value = 0).reset_index()

In [5]:
pivot_train.head()


Out[5]:
Semana Cliente_ID Producto_ID 3 4 5 6 7 8 9 Agencia_ID Canal_ID Ruta_SAK
0 26 1182 3.688879 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2061 2 7212
1 26 4767 3.761200 0.000000 0.000000 0.000000 3.761200 3.761200 0.000000 2061 2 7212
2 26 30235 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.574711 2655 2 4189
3 26 30314 0.000000 0.000000 0.000000 0.000000 3.891820 0.000000 0.000000 2655 2 4189
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 2 7212

In [9]:
pivot_train = pd.merge(left = pivot_train, right = agencia_for_cliente_producto, how = 'inner', on = ['Cliente_ID','Producto_ID'])
pivot_train = pd.merge(left = pivot_train, right = canal_for_cliente_producto, how = 'inner', on = ['Cliente_ID','Producto_ID'])
pivot_train = pd.merge(left = pivot_train, right = ruta_for_cliente_producto, how = 'inner', on = ['Cliente_ID','Producto_ID'])

In [11]:
pivot_train.to_pickle('pivot_train_with_zero.pickle')

In [4]:
pivot_train = pd.read_pickle('pivot_train_with_zero.pickle')

In [15]:
pivot_train.to_pickle('pivot_train_with_nan.pickle')

In [3]:
pivot_train = pd.read_pickle('pivot_train_with_nan.pickle')

In [15]:
pivot_train = pivot_train.rename(columns={3: 'Sem3', 4: 'Sem4',5: 'Sem5', 6: 'Sem6',7: 'Sem7', 8: 'Sem8',9: 'Sem9'})

In [16]:
pivot_train.head()


Out[16]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID Canal_ID Ruta_SAK
0 26 1182 3.688879 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2061 2 7212
1 26 4767 3.761200 0.000000 0.000000 0.000000 3.761200 3.761200 0.000000 2061 2 7212
2 26 30235 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.574711 2655 2 4189
3 26 30314 0.000000 0.000000 0.000000 0.000000 3.891820 0.000000 0.000000 2655 2 4189
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 2 7212

In [8]:
pivot_train.columns.values


Out[8]:
array(['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK'], dtype=object)

make pivot table of test



In [7]:
test_dataset = pd.read_csv('origin/test.csv')
test_dataset.head()


Out[7]:
id Semana Agencia_ID Canal_ID Ruta_SAK Cliente_ID Producto_ID
0 0 11 4037 1 2209 4639078 35305
1 1 11 2237 1 1226 4705135 1238
2 2 10 2045 1 2831 4549769 32940
3 3 11 1227 1 4448 4717855 43066
4 4 11 1219 1 1130 966351 1277

In [7]:
test_dataset[test_dataset['Semana'] == 10].shape


Out[7]:
(3538385, 7)

In [8]:
test_dataset[test_dataset['Semana'] == 11].shape


Out[8]:
(3460866, 7)

In [74]:
pivot_test = pd.merge(left=pivot_train, right = test_dataset[['id','Cliente_ID','Producto_ID','Semana']],
                      on =['Cliente_ID','Producto_ID'],how = 'inner' )
pivot_test.head()


Out[74]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID Canal_ID Ruta_SAK id Semana
0 26 31518 NaN 2.397895 NaN NaN NaN NaN NaN 2655 2 4189 1569352 10
1 26 34206 4.795791 4.406719 3.76120 4.248495 4.574711 4.499810 4.682131 2061 2 7212 1547831 11
2 26 34210 2.890372 3.761200 3.78419 3.496508 3.433987 3.688879 3.688879 2061 2 7212 6667200 10
3 26 34785 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 2061 2 7212 1592616 10
4 26 34785 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 2061 2 7212 6825659 11

In [8]:
pivot_test_new = pd.merge(pivot_train[['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9']],right = test_dataset, on = ['Cliente_ID','Producto_ID'],how = 'right')

In [9]:
pivot_test_new.head()


Out[9]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 id Semana Agencia_ID Canal_ID Ruta_SAK
0 26.0 31518.0 NaN 2.397895 NaN NaN NaN NaN NaN 1569352 10 2655 2 4189
1 26.0 34206.0 4.795791 4.406719 3.76120 4.248495 4.574711 4.499810 4.682131 1547831 11 2061 2 7212
2 26.0 34210.0 2.890372 3.761200 3.78419 3.496508 3.433987 3.688879 3.688879 6667200 10 2061 2 7212
3 26.0 34785.0 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 1592616 10 2061 2 7212
4 26.0 34785.0 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 6825659 11 2061 2 7212

In [10]:
pivot_test_new.to_pickle('pivot_test.pickle')

In [75]:
pivot_test.to_pickle('pivot_test.pickle')

In [4]:
pivot_test = pd.read_pickle('pivot_test.pickle')
pivot_test.head()


Out[4]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID Canal_ID Ruta_SAK id Semana
0 26 31518 NaN 2.397895 NaN NaN NaN NaN NaN 2655 2 4189 1569352 10
1 26 34206 4.795791 4.406719 3.76120 4.248495 4.574711 4.499810 4.682131 2061 2 7212 1547831 11
2 26 34210 2.890372 3.761200 3.78419 3.496508 3.433987 3.688879 3.688879 2061 2 7212 6667200 10
3 26 34785 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 2061 2 7212 1592616 10
4 26 34785 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 2061 2 7212 6825659 11

groupby use Agencia_ID, Ruta_SAK, Cliente_ID, Producto_ID



In [52]:
train_dataset.head()


Out[52]:
Semana Agencia_ID Canal_ID Ruta_SAK Cliente_ID Producto_ID Demanda_uni_equil log_demand
0 3 1110 7 3301 15766 1212 3 1.386294
1 3 1110 7 3301 15766 1216 4 1.609438
2 3 1110 7 3301 15766 1238 4 1.609438
3 3 1110 7 3301 15766 1240 4 1.609438
4 3 1110 7 3301 15766 1242 3 1.386294

In [70]:
import itertools
col_list = ['Agencia_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID']
all_combine = itertools.combinations(col_list,2)

In [71]:
list_2element_combine = [list(tuple) for tuple in all_combine]

In [75]:
col_1elm_2elm = col_list + list_2element_combine
col_1elm_2elm


Out[75]:
['Agencia_ID',
 'Ruta_SAK',
 'Cliente_ID',
 'Producto_ID',
 ['Agencia_ID', 'Ruta_SAK'],
 ['Agencia_ID', 'Cliente_ID'],
 ['Agencia_ID', 'Producto_ID'],
 ['Ruta_SAK', 'Cliente_ID'],
 ['Ruta_SAK', 'Producto_ID'],
 ['Cliente_ID', 'Producto_ID']]

In [79]:
train_dataset_test = train_dataset[train_dataset['Semana'] < 8].copy()

if predict week 8, use data from 3,4,5,6,7

if predict week 9, use data from 3,4,5,6,7



In [11]:
def categorical_useful(train_dataset,pivot_train):
#     if is_train:
#         train_dataset_test = train_dataset[train_dataset['Semana'] < 8].copy()
#     elif is_train == False:
    train_dataset_test = train_dataset.copy()
        
    log_demand_by_agen = train_dataset_test[['Agencia_ID','log_demand']].groupby('Agencia_ID').mean().reset_index()
    log_demand_by_ruta = train_dataset_test[['Ruta_SAK','log_demand']].groupby('Ruta_SAK').mean().reset_index()
    log_demand_by_cliente = train_dataset_test[['Cliente_ID','log_demand']].groupby('Cliente_ID').mean().reset_index()
    log_demand_by_producto = train_dataset_test[['Producto_ID','log_demand']].groupby('Producto_ID').mean().reset_index()
    
    log_demand_by_agen_ruta = train_dataset_test[['Agencia_ID', 'Ruta_SAK',
                                                  'log_demand']].groupby(['Agencia_ID', 'Ruta_SAK']).mean().reset_index()
    log_demand_by_agen_cliente = train_dataset_test[['Agencia_ID', 'Cliente_ID',
                                                     'log_demand']].groupby(['Agencia_ID', 'Cliente_ID']).mean().reset_index()
    log_demand_by_agen_producto = train_dataset_test[['Agencia_ID', 'Producto_ID',
                                                      'log_demand']].groupby(['Agencia_ID', 'Producto_ID']).mean().reset_index()
    log_demand_by_ruta_cliente = train_dataset_test[['Ruta_SAK', 'Cliente_ID',
                                                     'log_demand']].groupby(['Ruta_SAK', 'Cliente_ID']).mean().reset_index()
    log_demand_by_ruta_producto = train_dataset_test[['Ruta_SAK', 'Producto_ID',
                                                      'log_demand']].groupby(['Ruta_SAK', 'Producto_ID']).mean().reset_index()
    log_demand_by_cliente_producto = train_dataset_test[['Cliente_ID', 'Producto_ID',
                                                     'log_demand']].groupby(['Cliente_ID', 'Producto_ID']).mean().reset_index()
    
    log_demand_by_cliente_producto_agen = train_dataset_test[[
            'Cliente_ID','Producto_ID','Agencia_ID','log_demand']].groupby(['Cliente_ID',
                                                                            'Agencia_ID','Producto_ID']).mean().reset_index()
    
    log_sum_by_cliente = train_dataset_test[['Cliente_ID','log_demand']].groupby('Cliente_ID').sum().reset_index()
    
    ruta_freq_semana = train_dataset[['Semana','Ruta_SAK']].groupby(['Ruta_SAK']).count().reset_index()
    clien_freq_semana = train_dataset[['Semana','Cliente_ID']].groupby(['Cliente_ID']).count().reset_index()
    agen_freq_semana = train_dataset[['Semana','Agencia_ID']].groupby(['Agencia_ID']).count().reset_index()
    prod_freq_semana = train_dataset[['Semana','Producto_ID']].groupby(['Producto_ID']).count().reset_index()
    
    
    pivot_train = pd.merge(left = pivot_train,right = ruta_freq_semana,
                           how = 'left', on = ['Ruta_SAK']).rename(columns={'Semana': 'ruta_freq'})
    pivot_train = pd.merge(left = pivot_train,right = clien_freq_semana,
                           how = 'left', on = ['Cliente_ID']).rename(columns={'Semana': 'clien_freq'})
    pivot_train = pd.merge(left = pivot_train,right = agen_freq_semana,
                           how = 'left', on = ['Agencia_ID']).rename(columns={'Semana': 'agen_freq'})
    pivot_train = pd.merge(left = pivot_train,right = prod_freq_semana,
                           how = 'left', on = ['Producto_ID']).rename(columns={'Semana': 'prod_freq'})
    
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_agen,
                           how = 'left', on = ['Agencia_ID']).rename(columns={'log_demand': 'agen_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_ruta,
                           how = 'left', on = ['Ruta_SAK']).rename(columns={'log_demand': 'ruta_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_cliente,
                           how = 'left', on = ['Cliente_ID']).rename(columns={'log_demand': 'cliente_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_producto,
                           how = 'left', on = ['Producto_ID']).rename(columns={'log_demand': 'producto_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_agen_ruta,
                           how = 'left', on = ['Agencia_ID', 'Ruta_SAK']).rename(columns={'log_demand': 'agen_ruta_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_agen_cliente,
                           how = 'left', on = ['Agencia_ID', 'Cliente_ID']).rename(columns={'log_demand': 'agen_cliente_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_agen_producto,
                           how = 'left', on = ['Agencia_ID', 'Producto_ID']).rename(columns={'log_demand': 'agen_producto_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_ruta_cliente,
                           how = 'left', on = ['Ruta_SAK', 'Cliente_ID']).rename(columns={'log_demand': 'ruta_cliente_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_ruta_producto,
                           how = 'left', on = ['Ruta_SAK', 'Producto_ID']).rename(columns={'log_demand': 'ruta_producto_for_log_de'})
    pivot_train = pd.merge(left = pivot_train,
                           right = log_demand_by_cliente_producto,
                           how = 'left', on = ['Cliente_ID', 'Producto_ID']).rename(columns={'log_demand': 'cliente_producto_for_log_de'})
    
    pivot_train = pd.merge(left = pivot_train,
                           right = log_sum_by_cliente,
                           how = 'left', on = ['Cliente_ID']).rename(columns={'log_demand': 'cliente_for_log_sum'})
    
    pivot_train = pd.merge(left = pivot_train,
                       right = log_demand_by_cliente_producto_agen,
                       how = 'left', on = ['Cliente_ID', 'Producto_ID',
                                           'Agencia_ID']).rename(columns={'log_demand': 'cliente_producto_agen_for_log_sum'})
    
    
    pivot_train['corr'] = pivot_train['producto_for_log_de'] * pivot_train['cliente_for_log_de'] / train_dataset_test['log_demand'].median()

    return pivot_train

In [12]:
def define_time_features(df, to_predict = 't_plus_1' , t_0 = 8):
    if(to_predict == 't_plus_1' ):
        df['t_min_1'] = df['Sem'+str(t_0-1)]
    if(to_predict == 't_plus_2' ):
        df['t_min_6'] = df['Sem'+str(t_0-6)]
      
    df['t_min_2'] = df['Sem'+str(t_0-2)]
    df['t_min_3'] = df['Sem'+str(t_0-3)]
    df['t_min_4'] = df['Sem'+str(t_0-4)]
    df['t_min_5'] = df['Sem'+str(t_0-5)]
    
    
    if(to_predict == 't_plus_1' ):
        df['t1_min_t2'] =  df['t_min_1'] - df['t_min_2']
        df['t1_min_t3'] =  df['t_min_1'] - df['t_min_3']
        df['t1_min_t4'] =  df['t_min_1'] - df['t_min_4']
        df['t1_min_t5'] =  df['t_min_1'] - df['t_min_5']
    
    if(to_predict == 't_plus_2' ):
        df['t2_min_t6'] =  df['t_min_2'] - df['t_min_6']
        df['t3_min_t6'] =  df['t_min_3'] - df['t_min_6']
        df['t4_min_t6'] =  df['t_min_4'] - df['t_min_6']
        df['t5_min_t6'] =  df['t_min_5'] - df['t_min_6']

    df['t2_min_t3'] =  df['t_min_2'] - df['t_min_3']
    df['t2_min_t4'] =  df['t_min_2'] - df['t_min_4']
    df['t2_min_t5'] =  df['t_min_2'] - df['t_min_5']

    df['t3_min_t4'] =  df['t_min_3'] - df['t_min_4']
    df['t3_min_t5'] =  df['t_min_3'] - df['t_min_5']

    df['t4_min_t5'] =  df['t_min_4'] - df['t_min_5']
        
    return df

In [13]:
def lin_regr(row, to_predict, t_0, semanas_numbers):
    row = row.copy()
    row.index = semanas_numbers
    row = row.dropna()
    if(len(row>2)):
        X = np.ones(shape=(len(row), 2))
        X[:,1] = row.index
        y = row.values
        regr = linear_model.LinearRegression()
        regr.fit(X, y)
        if(to_predict == 't_plus_1'):
            return regr.predict([[1,t_0+1]])[0]
        elif(to_predict == 't_plus_2'):
            return regr.predict([[1,t_0+2]])[0]
    else:
        return None

In [14]:
def lin_regr_features(pivot_df,to_predict, semanas_numbers,t_0):
    pivot_df = pivot_df.copy()
    semanas_names = ['Sem%i' %i for i in semanas_numbers]
    columns = ['Sem%i' %i for i in semanas_numbers]
    columns.append('Producto_ID')
    pivot_grouped = pivot_df[columns].groupby('Producto_ID').aggregate('mean')
    pivot_grouped['LR_prod'] = np.zeros(len(pivot_grouped))
    pivot_grouped['LR_prod'] = pivot_grouped[semanas_names].apply(lin_regr, axis = 1,
                                                                  to_predict = to_predict,
                                                                  t_0 = t_0, semanas_numbers = semanas_numbers )
    pivot_df = pd.merge(pivot_df, pivot_grouped[['LR_prod']], how='left', left_on = 'Producto_ID', right_index=True)
    pivot_df['LR_prod_corr'] = pivot_df['LR_prod'] * pivot_df['cliente_for_log_sum'] / 100
    return pivot_df

In [15]:
cliente_tabla = pd.read_csv('origin/cliente_tabla.csv')
town_state = pd.read_csv('origin/town_state.csv')

In [16]:
town_state['town_id'] = town_state['Town'].str.split()
town_state['town_id'] = town_state['Town'].str.split(expand = True)

In [17]:
def add_pro_info(dataset):
    train_basic_feature = dataset[['Cliente_ID','Producto_ID','Agencia_ID']].copy()
    train_basic_feature.drop_duplicates(inplace = True)

    cliente_per_town = pd.merge(train_basic_feature,cliente_tabla,on = 'Cliente_ID',how= 'inner' )
#     print cliente_per_town.shape
    cliente_per_town = pd.merge(cliente_per_town,town_state[['Agencia_ID','town_id']],on = 'Agencia_ID',how= 'inner' )

#     print cliente_per_town.shape

    cliente_per_town_count = cliente_per_town[['NombreCliente','town_id']].groupby('town_id').count().reset_index()
#     print cliente_per_town_count.head()

    cliente_per_town_count_final = pd.merge(cliente_per_town[['Cliente_ID','Producto_ID','town_id','Agencia_ID']],
                                            cliente_per_town_count,on = 'town_id',how = 'inner')

#     print cliente_per_town_count_final.head()
    cliente_per_town_count_final.drop_duplicates(inplace = True)
    
    dataset_final = pd.merge(dataset,cliente_per_town_count_final[['Cliente_ID','Producto_ID','NombreCliente','Agencia_ID']],
                             on = ['Cliente_ID','Producto_ID','Agencia_ID'],how = 'left')
    return dataset_final

In [18]:
pre_product = pd.read_csv('preprocessed_products.csv',index_col = 0)

pre_product['weight_per_piece'] = pd.to_numeric(pre_product['weight_per_piece'], errors='coerce')
pre_product['weight'] = pd.to_numeric(pre_product['weight'], errors='coerce')
pre_product['pieces'] = pd.to_numeric(pre_product['pieces'], errors='coerce')

In [19]:
def add_product(dataset):

    dataset = pd.merge(dataset,pre_product[['ID','weight','weight_per_piece','pieces']],
                                                             left_on = 'Producto_ID',right_on = 'ID',how = 'left')
    return dataset

data for predict week [34567----9], time plus 2 week



In [ ]:
train_34567 = train_dataset.loc[train_dataset['Semana'].isin([3,4,5,6,7]), :].copy()
train_pivot_34567_to_9 = pivot_train.loc[(pivot_train['Sem9'].notnull()),:].copy()

train_pivot_34567_to_9 = categorical_useful(train_34567,train_pivot_34567_to_9)

del train_34567
gc.collect()

train_pivot_34567_to_9 = define_time_features(train_pivot_34567_to_9, to_predict = 't_plus_2' , t_0 = 9)
train_pivot_34567_to_9 = lin_regr_features(train_pivot_34567_to_9,to_predict ='t_plus_2',
                                           semanas_numbers = [3,4,5,6,7],t_0 = 9)

train_pivot_34567_to_9['target'] = train_pivot_34567_to_9['Sem9']
train_pivot_34567_to_9.drop(['Sem8','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_34567_to_9[['Sem3','Sem4','Sem5','Sem6','Sem7']].cumsum(axis = 1)

train_pivot_34567_to_9.drop(['Sem3','Sem4','Sem5','Sem6','Sem7'],axis =1,inplace = True)

train_pivot_34567_to_9 = pd.concat([train_pivot_34567_to_9,train_pivot_cum_sum],axis =1)

train_pivot_34567_to_9 = train_pivot_34567_to_9.rename(columns={'Sem3': 't_m_6_cum',
                                          'Sem4': 't_m_5_cum','Sem5': 't_m_4_cum',
                                                                'Sem6': 't_m_3_cum','Sem7': 't_m_2_cum'})
# add geo_info
train_pivot_34567_to_9 = add_pro_info(train_pivot_34567_to_9)

#add product info
train_pivot_34567_to_9 = add_product(train_pivot_34567_to_9)
train_pivot_34567_to_9.drop(['ID'],axis = 1,inplace = True)

gc.collect()

train_pivot_34567_to_9.head()

In [ ]:


In [ ]:


In [ ]:


In [ ]:

data for predict week 8&9, time plus 1 week


train_45678 for 8+1 =9



In [15]:
train_45678 = train_dataset.loc[train_dataset['Semana'].isin([4,5,6,7,8]), :].copy()
train_pivot_45678_to_9 = pivot_train.loc[(pivot_train['Sem9'].notnull()),:].copy()

train_pivot_45678_to_9 = categorical_useful(train_45678,train_pivot_45678_to_9)

del train_45678
gc.collect()

train_pivot_45678_to_9 = define_time_features(train_pivot_45678_to_9, to_predict = 't_plus_1' , t_0 = 9)
train_pivot_45678_to_9 = lin_regr_features(train_pivot_45678_to_9,to_predict ='t_plus_1',
                                           semanas_numbers = [4,5,6,7,8],t_0 = 8)

train_pivot_45678_to_9['target'] = train_pivot_45678_to_9['Sem9']
train_pivot_45678_to_9.drop(['Sem3','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_45678_to_9[['Sem4','Sem5','Sem6','Sem7','Sem8']].cumsum(axis = 1)

train_pivot_45678_to_9.drop(['Sem4','Sem5','Sem6','Sem7','Sem8'],axis =1,inplace = True)

train_pivot_45678_to_9 = pd.concat([train_pivot_45678_to_9,train_pivot_cum_sum],axis =1)

train_pivot_45678_to_9 = train_pivot_45678_to_9.rename(columns={'Sem4': 't_m_5_cum',
                                          'Sem5': 't_m_4_cum','Sem6': 't_m_3_cum', 'Sem7': 't_m_2_cum','Sem8': 't_m_1_cum'})
# add geo_info
train_pivot_45678_to_9 = add_pro_info(train_pivot_45678_to_9)

#add product info
train_pivot_45678_to_9 = add_product(train_pivot_45678_to_9)
train_pivot_45678_to_9.drop(['ID'],axis = 1,inplace = True)

gc.collect()

train_pivot_45678_to_9.head()


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-15-b7f9f571d438> in <module>()
----> 1 train_45678 = train_dataset.loc[train_dataset['Semana'].isin([4,5,6,7,8]), :].copy()
      2 train_pivot_45678_to_9 = pivot_train.loc[(pivot_train['Sem9'].notnull()),:].copy()
      3 
      4 train_pivot_45678_to_9 = categorical_useful(train_45678,train_pivot_45678_to_9)
      5 

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in isin(self, values)
   2413 
   2414         """
-> 2415         result = algos.isin(_values_from_object(self), values)
   2416         return self._constructor(result, index=self.index).__finalize__(self)
   2417 

/usr/local/lib/python2.7/dist-packages/pandas/core/algorithms.pyc in isin(comps, values)
    140         f = lambda x, y: lib.ismember(x, set(values))
    141 
--> 142     return f(comps, values)
    143 
    144 

/usr/local/lib/python2.7/dist-packages/pandas/core/algorithms.pyc in <lambda>(x, y)
    138         pass
    139     else:
--> 140         f = lambda x, y: lib.ismember(x, set(values))
    141 
    142     return f(comps, values)

KeyboardInterrupt: 

In [42]:
train_pivot_45678_to_9.columns.values


Out[42]:
array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_1', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't1_min_t2', 't1_min_t3',
       't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       't_m_1_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)

In [47]:
train_pivot_45678_to_9.to_csv('train_pivot_45678_to_9.csv')

In [ ]:
train_pivot_45678_to_9 = pd.read_csv('train_pivot_45678_to_9.csv',index_col = 0)

In [47]:
train_pivot_45678_to_9.to_csv('train_pivot_45678_to_9_new.csv')

In [18]:
train_pivot_45678_to_9 = pd.read_csv('train_pivot_45678_to_9_new.csv',index_col = 0)

train_34567 7+1 = 8



In [16]:
train_34567 = train_dataset.loc[train_dataset['Semana'].isin([3,4,5,6,7]), :].copy()
train_pivot_34567_to_8 = pivot_train.loc[(pivot_train['Sem8'].notnull()),:].copy()

train_pivot_34567_to_8 = categorical_useful(train_34567,train_pivot_34567_to_8)

del train_34567
gc.collect()

train_pivot_34567_to_8 = define_time_features(train_pivot_34567_to_8, to_predict = 't_plus_1' , t_0 = 8)
train_pivot_34567_to_8 = lin_regr_features(train_pivot_34567_to_8,to_predict = 't_plus_1',
                                           semanas_numbers = [3,4,5,6,7],t_0 = 7)

train_pivot_34567_to_8['target'] = train_pivot_34567_to_8['Sem8']
train_pivot_34567_to_8.drop(['Sem8','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_34567_to_8[['Sem3','Sem4','Sem5','Sem6','Sem7']].cumsum(axis = 1)

train_pivot_34567_to_8.drop(['Sem3','Sem4','Sem5','Sem6','Sem7'],axis =1,inplace = True)

train_pivot_34567_to_8 = pd.concat([train_pivot_34567_to_8,train_pivot_cum_sum],axis =1)

train_pivot_34567_to_8 = train_pivot_34567_to_8.rename(columns={'Sem3': 't_m_5_cum','Sem4': 't_m_4_cum',
                                          'Sem5': 't_m_3_cum','Sem6': 't_m_2_cum',
                                                                'Sem7': 't_m_1_cum'})
# add product_info
train_pivot_34567_to_8 = add_pro_info(train_pivot_34567_to_8)

#add product

train_pivot_34567_to_8 = add_product(train_pivot_34567_to_8)

train_pivot_34567_to_8.drop(['ID'],axis = 1,inplace = True)

gc.collect()

train_pivot_34567_to_8.head()


Out[16]:
Semana Cliente_ID Producto_ID Agencia_ID Canal_ID Ruta_SAK ruta_freq clien_freq agen_freq prod_freq agen_for_log_de ... target t_m_5_cum t_m_4_cum t_m_3_cum t_m_2_cum t_m_1_cum NombreCliente weight weight_per_piece pieces
0 26 1182 2061 2 7212 548.0 160.0 12574 411493.0 3.489749 ... 0.000000 3.688879 NaN NaN NaN 3.688879 18434 210.0 210.00 1.0
1 26 4767 2061 2 7212 548.0 160.0 12574 112080.0 3.489749 ... 3.761200 3.761200 NaN NaN NaN 7.522400 18434 250.0 NaN NaN
2 26 31393 2061 2 7212 548.0 160.0 12574 15736.0 3.489749 ... 3.135494 3.044522 5.877736 8.650325 11.422913 14.367352 18434 640.0 NaN NaN
3 26 34204 2061 2 7212 548.0 160.0 12574 17261.0 3.489749 ... 3.828641 3.784190 7.218177 11.024839 14.580187 18.191105 18434 450.0 56.25 8.0
4 26 34206 2061 2 7212 548.0 160.0 12574 87803.0 3.489749 ... 4.499810 4.795791 9.202510 12.963710 17.212205 21.786916 18434 340.0 42.50 8.0

5 rows × 49 columns


In [49]:
train_pivot_34567_to_8.columns.values


Out[49]:
array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces'], dtype=object)

In [50]:
train_pivot_34567_to_8.to_csv('train_pivot_34567_to_8.csv')

In [5]:
train_pivot_34567_to_8 = pd.read_csv('train_pivot_34567_to_8.csv',index_col = 0)

In [20]:
gc.collect()


Out[20]:
42

concat train_pivot_45678_to_9 & train_pivot_34567_to_8 to perform t_plus_1, train_data is over



In [22]:
train_pivot_xgb_time1 = pd.concat([train_pivot_45678_to_9, train_pivot_34567_to_8],axis = 0,copy = False)

In [23]:
train_pivot_xgb_time1.columns.values


Out[23]:
array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_1', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't1_min_t2', 't1_min_t3',
       't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       't_m_1_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)

In [24]:
train_pivot_xgb_time1.shape


Out[24]:
(20768652, 49)

In [25]:
train_pivot_xgb_time1.to_csv('train_pivot_xgb_time1_44fea.csv')

In [54]:
train_pivot_xgb_time1.to_csv('train_pivot_xgb_time1.csv')

In [26]:
del train_pivot_xgb_time1
del train_pivot_45678_to_9
del train_pivot_34567_to_8
gc.collect()


Out[26]:
16

prepare for test data, for week 10, we use 5,6,7,8,9



In [7]:
pivot_test.head()


Out[7]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID Canal_ID Ruta_SAK id Semana
0 26 31518 NaN 2.397895 NaN NaN NaN NaN NaN 2655 2 4189 1569352 10
1 26 34206 4.795791 4.406719 3.76120 4.248495 4.574711 4.499810 4.682131 2061 2 7212 1547831 11
2 26 34210 2.890372 3.761200 3.78419 3.496508 3.433987 3.688879 3.688879 2061 2 7212 6667200 10
3 26 34785 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 2061 2 7212 1592616 10
4 26 34785 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 2061 2 7212 6825659 11

In [20]:
pivot_test_week10 = pivot_test.loc[pivot_test['sem10_sem11'] == 10]
pivot_test_week10.reset_index(drop=True,inplace = True)
pivot_test_week10.head()


Out[20]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 id sem10_sem11 Agencia_ID Canal_ID Ruta_SAK
0 26.0 31518.0 NaN 2.397895 NaN NaN NaN NaN NaN 1569352 10 2655 2 4189
1 26.0 34210.0 2.890372 3.761200 3.784190 3.496508 3.433987 3.688879 3.688879 6667200 10 2061 2 7212
2 26.0 34785.0 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 1592616 10 2061 2 7212
3 26.0 34786.0 4.356709 4.158883 4.430817 3.713572 4.317488 4.094345 4.330733 3909690 10 2061 2 7212
4 26.0 35142.0 3.258097 1.945910 3.583519 3.555348 3.688879 3.828641 3.663562 3659672 10 2061 2 7212

In [21]:
pivot_test_week10.shape


Out[21]:
(3538385, 14)

In [22]:
train_56789 = train_dataset.loc[train_dataset['Semana'].isin([5,6,7,8,9]), :].copy()
train_pivot_56789_to_10 = pivot_test_week10.copy()

train_pivot_56789_to_10 = categorical_useful(train_56789,train_pivot_56789_to_10)

del train_56789
gc.collect()

train_pivot_56789_to_10 = define_time_features(train_pivot_56789_to_10, to_predict = 't_plus_1' , t_0 = 10)

train_pivot_56789_to_10 = lin_regr_features(train_pivot_56789_to_10,to_predict ='t_plus_1' , 
                                            semanas_numbers = [5,6,7,8,9],t_0 = 9)

train_pivot_56789_to_10.drop(['Sem3','Sem4'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_56789_to_10[['Sem5','Sem6','Sem7','Sem8','Sem9']].cumsum(axis = 1)

train_pivot_56789_to_10.drop(['Sem5','Sem6','Sem7','Sem8','Sem9'],axis =1,inplace = True)

train_pivot_56789_to_10 = pd.concat([train_pivot_56789_to_10,train_pivot_cum_sum],axis =1)


train_pivot_56789_to_10 = train_pivot_56789_to_10.rename(columns={'Sem5': 't_m_5_cum',
                                          'Sem6': 't_m_4_cum','Sem7': 't_m_3_cum',
                                                                  'Sem8': 't_m_2_cum','Sem9': 't_m_1_cum'})
# add product_info
train_pivot_56789_to_10 = add_pro_info(train_pivot_56789_to_10)

#
train_pivot_56789_to_10 = add_product(train_pivot_56789_to_10)

train_pivot_56789_to_10.drop(['ID'],axis =1,inplace = True)

train_pivot_56789_to_10.head()


Out[22]:
Semana Cliente_ID Producto_ID id sem10_sem11 Agencia_ID Canal_ID Ruta_SAK ruta_freq clien_freq agen_freq ... LR_prod_corr t_m_5_cum t_m_4_cum t_m_3_cum t_m_2_cum t_m_1_cum NombreCliente weight weight_per_piece pieces
0 26.0 31518.0 1569352 10 2655 2 4189 10.0 166.0 3604 ... 18.459131 NaN NaN NaN NaN NaN 131 691.0 NaN NaN
1 26.0 34210.0 6667200 10 2061 2 7212 713.0 166.0 12208 ... 21.088184 3.784190 7.280697 10.714684 14.403564 18.092443 6027 740.0 NaN NaN
2 26.0 34785.0 1592616 10 2061 2 7212 713.0 166.0 12208 ... 19.477601 NaN 2.833213 6.165418 8.730367 11.949243 6027 480.0 NaN NaN
3 26.0 34786.0 3909690 10 2061 2 7212 713.0 166.0 12208 ... 22.144868 4.430817 8.144389 12.461877 16.556222 20.886955 6027 680.0 NaN NaN
4 26.0 35142.0 3659672 10 2061 2 7212 713.0 166.0 12208 ... 15.688445 3.583519 7.138867 10.827746 14.656388 18.319949 6027 567.0 NaN NaN

5 rows × 50 columns


In [23]:
train_pivot_56789_to_10.columns.values


Out[23]:
array(['Cliente_ID', 'Producto_ID', 'id', 'sem10_sem11', 'Agencia_ID',
       'Canal_ID', 'Ruta_SAK', 'ruta_freq', 'clien_freq', 'agen_freq',
       'prod_freq', 'agen_for_log_de', 'ruta_for_log_de',
       'cliente_for_log_de', 'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_1', 't_min_2',
       't_min_3', 't_min_4', 't_min_5', 't1_min_t2', 't1_min_t3',
       't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum', 't_m_1_cum',
       'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)

In [24]:
train_pivot_56789_to_10.to_pickle('train_pivot_56789_to_10_44fea.pickle')

begin predict for week 11


train_3456 for 6+2 = 8



In [27]:
train_3456 = train_dataset.loc[train_dataset['Semana'].isin([3,4,5,6]), :].copy()
train_pivot_3456_to_8 = pivot_train.loc[(pivot_train['Sem8'].notnull()),:].copy()

train_pivot_3456_to_8 = categorical_useful(train_3456,train_pivot_3456_to_8)

del train_3456
gc.collect()

train_pivot_3456_to_8 = define_time_features(train_pivot_3456_to_8, to_predict = 't_plus_2' , t_0 = 8)

#notice that the t_0 means different
train_pivot_3456_to_8 = lin_regr_features(train_pivot_3456_to_8,to_predict = 't_plus_2', semanas_numbers = [3,4,5,6],t_0 = 6)

train_pivot_3456_to_8['target'] = train_pivot_3456_to_8['Sem8']
train_pivot_3456_to_8.drop(['Sem7','Sem8','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_3456_to_8[['Sem3','Sem4','Sem5','Sem6']].cumsum(axis = 1)

train_pivot_3456_to_8.drop(['Sem3','Sem4','Sem5','Sem6'],axis =1,inplace = True)

train_pivot_3456_to_8 = pd.concat([train_pivot_3456_to_8,train_pivot_cum_sum],axis =1)

train_pivot_3456_to_8 = train_pivot_3456_to_8.rename(columns={'Sem4': 't_m_4_cum',
                                          'Sem5': 't_m_3_cum','Sem6': 't_m_2_cum', 'Sem3': 't_m_5_cum'})
# add product_info
train_pivot_3456_to_8 = add_pro_info(train_pivot_3456_to_8)

train_pivot_3456_to_8 = add_product(train_pivot_3456_to_8)

train_pivot_3456_to_8.drop(['ID'],axis =1,inplace = True)

train_pivot_3456_to_8.head()


Out[27]:
Semana Cliente_ID Producto_ID Agencia_ID Canal_ID Ruta_SAK ruta_freq clien_freq agen_freq prod_freq agen_for_log_de ... target t_m_5_cum t_m_4_cum t_m_3_cum t_m_2_cum NombreCliente ID weight weight_per_piece pieces
0 26 1182 2061 2 7212 410.0 127.0 10085 320507.0 3.491654 ... 0.000000 3.688879 NaN NaN NaN 18434 1182 210.0 210.00 1.0
1 26 4767 2061 2 7212 410.0 127.0 10085 88159.0 3.491654 ... 3.761200 3.761200 NaN NaN NaN 18434 4767 250.0 NaN NaN
2 26 31393 2061 2 7212 410.0 127.0 10085 12237.0 3.491654 ... 3.135494 3.044522 5.877736 8.650325 11.422913 18434 31393 640.0 NaN NaN
3 26 34204 2061 2 7212 410.0 127.0 10085 13814.0 3.491654 ... 3.828641 3.784190 7.218177 11.024839 14.580187 18434 34204 450.0 56.25 8.0
4 26 34206 2061 2 7212 410.0 127.0 10085 71868.0 3.491654 ... 4.499810 4.795791 9.202510 12.963710 17.212205 18434 34206 340.0 42.50 8.0

5 rows × 44 columns


In [30]:
train_pivot_3456_to_8.columns.values


Out[30]:
array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_2', 't_min_3',
       't_min_4', 't_min_5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)

In [58]:
train_pivot_3456_to_8.to_csv('train_pivot_3456_to_8.csv')

train_4567 for 7 + 2 = 9


In [28]:
train_4567 = train_dataset.loc[train_dataset['Semana'].isin([4,5,6,7]), :].copy()
train_pivot_4567_to_9 = pivot_train.loc[(pivot_train['Sem9'].notnull()),:].copy()

train_pivot_4567_to_9 = categorical_useful(train_4567,train_pivot_4567_to_9)

del train_4567
gc.collect()

train_pivot_4567_to_9 = define_time_features(train_pivot_4567_to_9, to_predict = 't_plus_2' , t_0 = 9)

#notice that the t_0 means different
train_pivot_4567_to_9 = lin_regr_features(train_pivot_4567_to_9,to_predict = 't_plus_2', 
                                          semanas_numbers = [4,5,6,7],t_0 = 7)

train_pivot_4567_to_9['target'] = train_pivot_4567_to_9['Sem9']
train_pivot_4567_to_9.drop(['Sem3','Sem8','Sem9'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_4567_to_9[['Sem7','Sem4','Sem5','Sem6']].cumsum(axis = 1)

train_pivot_4567_to_9.drop(['Sem7','Sem4','Sem5','Sem6'],axis =1,inplace = True)

train_pivot_4567_to_9 = pd.concat([train_pivot_4567_to_9,train_pivot_cum_sum],axis =1)

train_pivot_4567_to_9 = train_pivot_4567_to_9.rename(columns={'Sem4': 't_m_5_cum',
                                          'Sem5': 't_m_4_cum','Sem6': 't_m_3_cum', 'Sem7': 't_m_2_cum'})
# add product_info
train_pivot_4567_to_9 = add_pro_info(train_pivot_4567_to_9)

train_pivot_4567_to_9 = add_product(train_pivot_4567_to_9)

train_pivot_4567_to_9.drop(['ID'],axis =1,inplace = True)

train_pivot_4567_to_9.head()


Out[28]:
Semana Cliente_ID Producto_ID Agencia_ID Canal_ID Ruta_SAK ruta_freq clien_freq agen_freq prod_freq agen_for_log_de ... LR_prod_corr target t_m_2_cum t_m_5_cum t_m_4_cum t_m_3_cum NombreCliente weight weight_per_piece pieces
0 26 30235 2655 2 4189 10.0 126.0 3322 1670.0 3.939269 ... 12.376057 4.574711 NaN NaN NaN NaN 233 600.0 75.00 8.0
1 26 31393 2061 2 7212 518.0 126.0 9992 12729.0 3.502839 ... 6.542010 2.639057 2.944439 5.777652 8.550241 11.322830 18989 640.0 NaN NaN
2 26 32962 2061 2 7212 518.0 126.0 9992 5940.0 3.502839 ... 10.276987 2.397895 NaN NaN NaN NaN 18989 180.0 30.00 6.0
3 26 34204 2061 2 7212 518.0 126.0 9992 13625.0 3.502839 ... 11.267610 3.784190 3.610918 7.044905 10.851568 14.406916 18989 450.0 56.25 8.0
4 26 34206 2061 2 7212 518.0 126.0 9992 62752.0 3.502839 ... 7.308716 4.682131 4.574711 8.981430 12.742630 16.991126 18989 340.0 42.50 8.0

5 rows × 43 columns


In [31]:
train_pivot_4567_to_9.columns.values


Out[31]:
array(['Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'ruta_freq', 'clien_freq', 'agen_freq', 'prod_freq',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'corr', 't_min_2', 't_min_3',
       't_min_4', 't_min_5', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod', 'LR_prod_corr',
       'target', 't_m_2_cum', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)

In [16]:
train_pivot_4567_to_9.to_csv('train_pivot_4567_to_9.csv')

concat



In [32]:
train_pivot_xgb_time2 = pd.concat([train_pivot_3456_to_8, train_pivot_4567_to_9],axis = 0,copy = False)

In [33]:
train_pivot_xgb_time2.columns.values


Out[33]:
array(['Agencia_ID', 'Canal_ID', 'Cliente_ID', 'LR_prod', 'LR_prod_corr',
       'NombreCliente', 'Producto_ID', 'Ruta_SAK',
       'agen_cliente_for_log_de', 'agen_for_log_de', 'agen_freq',
       'agen_producto_for_log_de', 'agen_ruta_for_log_de', 'clien_freq',
       'cliente_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_agen_for_log_sum', 'cliente_producto_for_log_de',
       'corr', 'pieces', 'prod_freq', 'producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_for_log_de', 'ruta_freq',
       'ruta_producto_for_log_de', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 't_m_2_cum', 't_m_3_cum',
       't_m_4_cum', 't_m_5_cum', 't_min_2', 't_min_3', 't_min_4',
       't_min_5', 'target', 'weight', 'weight_per_piece'], dtype=object)

In [34]:
train_pivot_xgb_time2.shape


Out[34]:
(20768652, 43)

In [35]:
train_pivot_xgb_time2.to_csv('train_pivot_xgb_time2_38fea.csv')

In [66]:
train_pivot_xgb_time2 = pd.read_csv('train_pivot_xgb_time2.csv',index_col = 0)
train_pivot_xgb_time2.head()


Out[66]:
Agencia_ID Canal_ID Cliente_ID LR_prod LR_prod_corr NombreCliente Producto_ID Ruta_SAK agen_cliente_for_log_de agen_for_log_de ... t_m_3_cum t_m_4_cum t_m_5_cum t_min_2 t_min_3 t_min_4 t_min_5 target weight weight_per_piece
0 2061 2 26 2.001190 7.293554 18434 1182 7212 2.852285 3.491654 ... NaN NaN 3.688879 NaN NaN NaN 3.688879 0.000000 210.0 210.00
1 2061 2 26 1.839411 6.703932 18434 4767 7212 2.852285 3.491654 ... NaN NaN 3.761200 NaN NaN NaN 3.761200 3.761200 250.0 NaN
2 2061 2 26 1.911283 6.965878 18434 31393 7212 2.852285 3.491654 ... 8.650325 5.877736 3.044522 2.772589 2.772589 2.833213 3.044522 3.135494 640.0 NaN
3 2061 2 26 3.113374 11.347029 18434 34204 7212 2.852285 3.491654 ... 11.024839 7.218177 3.784190 3.555348 3.806662 3.433987 3.784190 3.828641 450.0 56.25
4 2061 2 26 2.031231 7.403043 18434 34206 7212 2.852285 3.491654 ... 12.963710 9.202510 4.795791 4.248495 3.761200 4.406719 4.795791 4.499810 340.0 42.50

5 rows × 38 columns


In [72]:
del train_pivot_3456_to_8
del train_pivot_4567_to_9
del train_pivot_xgb_time2
del train_pivot_34567_to_8
del train_pivot_45678_to_9
del train_pivot_xgb_time1
gc.collect()


Out[72]:
119

for test data week 11, we use 6,7,8,9



In [29]:
pivot_test_week11 = pivot_test_new.loc[pivot_test_new['Semana'] == 11]
pivot_test_week11.reset_index(drop=True,inplace = True)
pivot_test_week11.head()


Out[29]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 id Semana Agencia_ID Canal_ID Ruta_SAK
0 26.0 34206.0 4.795791 4.406719 3.761200 4.248495 4.574711 4.499810 4.682131 1547831 11 2061 2 7212
1 26.0 34785.0 3.091042 NaN NaN 2.833213 3.332205 2.564949 3.218876 6825659 11 2061 2 7212
2 26.0 35148.0 2.772589 2.772589 1.791759 2.772589 2.397895 3.044522 2.995732 5853787 11 2061 2 7212
3 26.0 43335.0 1.945910 1.386294 NaN NaN NaN 0.000000 1.386294 2316053 11 2061 2 7212
4 26.0 43342.0 1.386294 2.197225 1.098612 NaN NaN 1.791759 3.044522 900676 11 2061 2 7212

In [83]:
pivot_test_week11.shape


Out[83]:
(2684949, 14)

In [30]:
train_6789 = train_dataset.loc[train_dataset['Semana'].isin([6,7,8,9]), :].copy()
train_pivot_6789_to_11 = pivot_test_week11.copy()

train_pivot_6789_to_11 = categorical_useful(train_6789,train_pivot_6789_to_11)

del train_6789
gc.collect()

train_pivot_6789_to_11 = define_time_features(train_pivot_6789_to_11, to_predict = 't_plus_2' , t_0 = 11)
train_pivot_6789_to_11 = lin_regr_features(train_pivot_6789_to_11,to_predict ='t_plus_2' ,
                                           semanas_numbers = [6,7,8,9],t_0 = 9)

train_pivot_6789_to_11.drop(['Sem3','Sem4','Sem5'],axis =1,inplace = True)

#add cum_sum
train_pivot_cum_sum = train_pivot_6789_to_11[['Sem6','Sem7','Sem8','Sem9']].cumsum(axis = 1)

train_pivot_6789_to_11.drop(['Sem6','Sem7','Sem8','Sem9'],axis =1,inplace = True)

train_pivot_6789_to_11 = pd.concat([train_pivot_6789_to_11,train_pivot_cum_sum],axis =1)

train_pivot_6789_to_11 = train_pivot_6789_to_11.rename(columns={'Sem6': 't_m_5_cum',
                                          'Sem7': 't_m_4_cum', 'Sem8': 't_m_3_cum','Sem9': 't_m_2_cum'})
# add product_info
train_pivot_6789_to_11 = add_pro_info(train_pivot_6789_to_11)

train_pivot_6789_to_11 = add_product(train_pivot_6789_to_11)

train_pivot_6789_to_11.drop(['ID'],axis = 1,inplace = True)

train_pivot_6789_to_11.head()


Out[30]:
Semana Cliente_ID Producto_ID id Semana Agencia_ID Canal_ID Ruta_SAK agen_for_log_de ruta_for_log_de cliente_for_log_de ... LR_prod LR_prod_corr t_m_5_cum t_m_4_cum t_m_3_cum t_m_2_cum NombreCliente weight weight_per_piece pieces
0 26.0 34206.0 1547831 11 2061 2 7212 3.535522 3.224378 2.939548 ... 1.891621 7.506688 4.248495 8.823206 13.323016 18.005147 5954 340.0 42.500 8.0
1 26.0 34785.0 6825659 11 2061 2 7212 3.535522 3.224378 2.939548 ... 4.016822 15.940314 2.833213 6.165418 8.730367 11.949243 5954 480.0 NaN NaN
2 26.0 35148.0 5853787 11 2061 2 7212 3.535522 3.224378 2.939548 ... 2.707100 10.742829 2.772589 5.170484 8.215006 11.210739 5954 567.0 NaN NaN
3 26.0 43335.0 2316053 11 2061 2 7212 3.535522 3.224378 2.939548 ... 2.513660 9.975183 NaN NaN 0.000000 1.386294 5954 435.0 54.375 8.0
4 26.0 43342.0 900676 11 2061 2 7212 3.535522 3.224378 2.939548 ... 2.846363 11.295476 NaN NaN 1.791759 4.836282 5954 248.0 62.000 4.0

5 rows × 39 columns


In [85]:
train_pivot_6789_to_11.shape


Out[85]:
(2684949, 39)

In [31]:
train_pivot_6789_to_11.to_pickle('train_pivot_6789_to_11_new.pickle')

over



In [27]:
% time pivot_train_categorical_useful = categorical_useful(train_dataset,pivot_train,is_train = True)

In [ ]:
% time pivot_train_categorical_useful = categorical_useful(train_dataset,pivot_train,is_train = True)

In [34]:
pivot_train_categorical_useful_train.to_csv('pivot_train_categorical_useful_with_nan.csv')

In [2]:
pivot_train_categorical_useful_train = pd.read_csv('pivot_train_categorical_useful_with_nan.csv',index_col = 0)
pivot_train_categorical_useful_train.head()


Out[2]:
3 4 5 6 7 8 9 Canal_ID agen_for_log_de ruta_for_log_de cliente_for_log_de producto_for_log_de agen_ruta_for_log_de agen_cliente_for_log_de agen_producto_for_log_de ruta_cliente_for_log_de ruta_producto_for_log_de cliente_producto_for_log_de cliente_for_log_sum
0 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2 3.489749 3.208232 2.897803 1.766994 2.877003 2.877003 4.370318 2.877003 3.694196 1.84444 463.648478
1 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2 3.489749 3.208232 2.897803 1.482496 2.877003 2.877003 4.253513 2.877003 4.164880 3.76120 463.648478
2 NaN NaN NaN NaN NaN NaN 4.574711 2 3.936934 3.095841 2.897803 3.106304 3.095841 3.015667 3.569138 3.095841 NaN NaN 463.648478
3 NaN NaN NaN NaN 3.891820 NaN NaN 2 3.936934 3.095841 2.897803 3.878160 3.095841 3.015667 3.943380 3.095841 3.891820 3.89182 463.648478
4 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2 3.489749 3.208232 2.897803 1.563950 2.877003 2.877003 3.333444 2.877003 3.013416 2.87347 463.648478

create time feature



In [13]:
pivot_train_categorical_useful.head()


Out[13]:
Semana Cliente_ID Producto_ID 3 4 5 6 7 8 9 Agencia_ID ... cliente_for_log_de producto_for_log_de agen_ruta_for_log_de agen_cliente_for_log_de agen_producto_for_log_de ruta_cliente_for_log_de ruta_producto_for_log_de cliente_producto_for_log_de cliente_for_log_sum corr
0 26 1182 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2061 ... 2.897803 1.766994 2.877003 2.877003 4.370318 2.877003 3.694196 1.84444 463.648478 3.693589
1 26 4767 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2061 ... 2.897803 1.482496 2.877003 2.877003 4.253513 2.877003 4.164880 3.76120 463.648478 3.098896
2 26 30235 NaN NaN NaN NaN NaN NaN 4.574711 2655 ... 2.897803 3.106304 3.095841 3.015667 3.569138 3.095841 NaN NaN 463.648478 6.493178
3 26 30314 NaN NaN NaN NaN 3.891820 NaN NaN 2655 ... 2.897803 3.878160 3.095841 3.015667 3.943380 3.095841 3.891820 3.89182 463.648478 8.106606
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 ... 2.897803 1.563950 2.877003 2.877003 3.333444 2.877003 3.013416 2.87347 463.648478 3.269161

5 rows × 24 columns


In [19]:
pivot_train_categorical_useful_time = define_time_features(pivot_train_categorical_useful,
                                                                 to_predict = 't_plus_1' , t_0 = 8)

In [20]:
pivot_train_categorical_useful_time.head()


Out[20]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID ... t1_min_t2 t1_min_t3 t1_min_t4 t1_min_t5 t2_min_t3 t2_min_t4 t2_min_t5 t3_min_t4 t3_min_t5 t4_min_t5
0 26 1182 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2061 ... NaN NaN NaN -3.688879 NaN NaN NaN NaN NaN NaN
1 26 4767 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2061 ... NaN NaN NaN 0.000000 NaN NaN NaN NaN NaN NaN
2 26 30235 NaN NaN NaN NaN NaN NaN 4.574711 2655 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 26 30314 NaN NaN NaN NaN 3.891820 NaN NaN 2655 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 ... 0.17185 0.17185 0.111226 -0.100083 0.0 -0.060625 -0.271934 -0.060625 -0.271934 -0.211309

5 rows × 39 columns


In [22]:
pivot_train_categorical_useful_time.columns


Out[22]:
Index([u'Cliente_ID', u'Producto_ID', u'Sem3', u'Sem4', u'Sem5', u'Sem6',
       u'Sem7', u'Sem8', u'Sem9', u'Agencia_ID', u'Canal_ID', u'Ruta_SAK',
       u'agen_for_log_de', u'ruta_for_log_de', u'cliente_for_log_de',
       u'producto_for_log_de', u'agen_ruta_for_log_de',
       u'agen_cliente_for_log_de', u'agen_producto_for_log_de',
       u'ruta_cliente_for_log_de', u'ruta_producto_for_log_de',
       u'cliente_producto_for_log_de', u'cliente_for_log_sum', u'corr',
       u't_min_1', u't_min_2', u't_min_3', u't_min_4', u't_min_5',
       u't1_min_t2', u't1_min_t3', u't1_min_t4', u't1_min_t5', u't2_min_t3',
       u't2_min_t4', u't2_min_t5', u't3_min_t4', u't3_min_t5', u't4_min_t5'],
      dtype='object', name=u'Semana')

fit mean feature on target



In [33]:
# Linear regression features 
pivot_train_categorical_useful_time_LR = lin_regr_features(pivot_train_categorical_useful_time, semanas_numbers = [3,4,5,6,7])
pivot_train_categorical_useful_time_LR.head()


Out[33]:
Semana Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID ... t1_min_t4 t1_min_t5 t2_min_t3 t2_min_t4 t2_min_t5 t3_min_t4 t3_min_t5 t4_min_t5 LR_prod LR_prod_corr
0 26 1182 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2061 ... NaN -3.688879 NaN NaN NaN NaN NaN NaN 1.775590 8.232495
1 26 4767 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2061 ... NaN 0.000000 NaN NaN NaN NaN NaN NaN 1.478578 6.855404
2 26 30235 NaN NaN NaN NaN NaN NaN 4.574711 2655 ... NaN NaN NaN NaN NaN NaN NaN NaN 3.099887 14.372579
3 26 30314 NaN NaN NaN NaN 3.891820 NaN NaN 2655 ... NaN NaN NaN NaN NaN NaN NaN NaN 3.966329 18.389824
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 ... 0.111226 -0.100083 0.0 -0.060625 -0.271934 -0.060625 -0.271934 -0.211309 1.504393 6.975095

5 rows × 41 columns


In [34]:
pivot_train_categorical_useful_time_LR.columns


Out[34]:
Index([u'Cliente_ID', u'Producto_ID', u'Sem3', u'Sem4', u'Sem5', u'Sem6',
       u'Sem7', u'Sem8', u'Sem9', u'Agencia_ID', u'Canal_ID', u'Ruta_SAK',
       u'agen_for_log_de', u'ruta_for_log_de', u'cliente_for_log_de',
       u'producto_for_log_de', u'agen_ruta_for_log_de',
       u'agen_cliente_for_log_de', u'agen_producto_for_log_de',
       u'ruta_cliente_for_log_de', u'ruta_producto_for_log_de',
       u'cliente_producto_for_log_de', u'cliente_for_log_sum', u'corr',
       u't_min_1', u't_min_2', u't_min_3', u't_min_4', u't_min_5',
       u't1_min_t2', u't1_min_t3', u't1_min_t4', u't1_min_t5', u't2_min_t3',
       u't2_min_t4', u't2_min_t5', u't3_min_t4', u't3_min_t5', u't4_min_t5',
       u'LR_prod', u'LR_prod_corr'],
      dtype='object', name=u'Semana')

In [42]:
pivot_train_categorical_useful_time_LR.to_csv('pivot_train_categorical_useful_time_LR.csv')

In [16]:
pivot_train_categorical_useful_time_LR = pd.read_csv('pivot_train_categorical_useful_time_LR.csv',index_col = 0)

In [17]:
pivot_train_categorical_useful_time_LR.head()


Out[17]:
Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID ... t1_min_t4 t1_min_t5 t2_min_t3 t2_min_t4 t2_min_t5 t3_min_t4 t3_min_t5 t4_min_t5 LR_prod LR_prod_corr
0 26 1182 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2061 ... NaN -3.688879 NaN NaN NaN NaN NaN NaN 1.775590 8.232495
1 26 4767 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2061 ... NaN 0.000000 NaN NaN NaN NaN NaN NaN 1.478578 6.855404
2 26 30235 NaN NaN NaN NaN NaN NaN 4.574711 2655 ... NaN NaN NaN NaN NaN NaN NaN NaN 3.099887 14.372579
3 26 30314 NaN NaN NaN NaN 3.891820 NaN NaN 2655 ... NaN NaN NaN NaN NaN NaN NaN NaN 3.966329 18.389824
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 ... 0.111226 -0.100083 0.0 -0.060625 -0.271934 -0.060625 -0.271934 -0.211309 1.504393 6.975095

5 rows × 41 columns

add dummy feature



In [4]:
# pivot_train_canal = pd.get_dummies(pivot_train_categorical_useful_train['Canal_ID'])

In [5]:
# pivot_train_categorical_useful_train = pivot_train_categorical_useful_train.join(pivot_train_canal)
# pivot_train_categorical_useful_train.head()


Out[5]:
Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Canal_ID agen_for_log_de ruta_for_log_de ... LR_prod_corr 1 2 4 5 6 7 8 9 11
0 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2 3.489749 3.208232 ... 8.232495 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2 3.489749 3.208232 ... 6.855404 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 NaN NaN NaN NaN NaN NaN 4.574711 2 3.936934 3.095841 ... 14.372579 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 NaN NaN NaN NaN 3.891820 NaN NaN 2 3.936934 3.095841 ... 18.389824 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2 3.489749 3.208232 ... 6.975095 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 46 columns

add product feature



In [2]:
%ls


0_prepocess_data.ipynb*            data_process.py*
1_xgboost.ipynb*                   origin/
agencia_for_cliente_producto.csv*  pivot_test.pickle*
baseline.ipynb*                    pivot_train_categorical_useful_time_LR.csv*
bimbo_perceptron/                  pivot_train_with_nan.pickle*
bimbo.py*                          preprocessed_products.csv*
bimbo.pyc*                         rnn.py*
canal_for_cliente_producto.csv*    ruta_for_cliente_producto.csv*
data_helper.py*                    train.py*
data_helper.pyc*

In [4]:
pre_product = pd.read_csv('preprocessed_products.csv',index_col = 0)
pre_product.head()


pre_product['weight_per_piece'] = pd.to_numeric(pre_product['weight_per_piece'], errors='coerce')
pre_product['weight'] = pd.to_numeric(pre_product['weight'], errors='coerce')
pre_product['pieces'] = pd.to_numeric(pre_product['pieces'], errors='coerce')

pivot_train_categorical_useful_time_LR_weight = pd.merge(pivot_train_categorical_useful_time_LR,
                                                         pre_product[['ID','weight','weight_per_piece']],
                                                         left_on = 'Producto_ID',right_on = 'ID',how = 'left')
pivot_train_categorical_useful_time_LR_weight.head()


Out[4]:
ID product_name brand weight pieces weight_per_piece has_choco has_vanilla has_multigrain
1 0 NO IDENTIFICADO 0 IDENTIFICADO 0.0 0.0 0 False False False
2 9 Capuccino Moka 750g NES 9 NES 750.0 1.0 1 False False False
3 41 Bimbollos Ext sAjonjoli 6p 480g BIM 41 BIM 480.0 6.0 80 False False False
4 53 Burritos Sincro 170g CU LON 53 LON 170.0 1.0 17- False False False
5 72 Div Tira Mini Doradita 4p 45g TR 72 TR 45.0 4.0 11.25 False False False

In [15]:


In [18]:
pivot_train_categorical_useful_time_LR_weight = pd.merge(pivot_train_categorical_useful_time_LR,
                                                         pre_product[['ID','weight','weight_per_piece']],
                                                         left_on = 'Producto_ID',right_on = 'ID',how = 'left')
pivot_train_categorical_useful_time_LR_weight.head()


Out[18]:
Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID ... t2_min_t4 t2_min_t5 t3_min_t4 t3_min_t5 t4_min_t5 LR_prod LR_prod_corr ID weight weight_per_piece
0 26 1182 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2061 ... NaN NaN NaN NaN NaN 1.775590 8.232495 1182 210.0 210.0
1 26 4767 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2061 ... NaN NaN NaN NaN NaN 1.478578 6.855404 4767 250.0 NaN
2 26 30235 NaN NaN NaN NaN NaN NaN 4.574711 2655 ... NaN NaN NaN NaN NaN 3.099887 14.372579 30235 600.0 75.0
3 26 30314 NaN NaN NaN NaN 3.891820 NaN NaN 2655 ... NaN NaN NaN NaN NaN 3.966329 18.389824 30314 360.0 NaN
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 ... -0.060625 -0.271934 -0.060625 -0.271934 -0.211309 1.504393 6.975095 31393 640.0 NaN

5 rows × 44 columns


In [50]:
pivot_train_categorical_useful_time_LR_weight.to_csv('pivot_train_categorical_useful_time_LR_weight.csv')

In [2]:
pivot_train_categorical_useful_time_LR_weight = pd.read_csv('pivot_train_categorical_useful_time_LR_weight.csv',index_col = 0)
pivot_train_categorical_useful_time_LR_weight.head()


Out[2]:
Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID ... t2_min_t4 t2_min_t5 t3_min_t4 t3_min_t5 t4_min_t5 LR_prod LR_prod_corr ID weight weight_per_piece
0 26 1182 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2061 ... NaN NaN NaN NaN NaN 1.775590 8.232495 1182 210.0 210.0
1 26 4767 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2061 ... NaN NaN NaN NaN NaN 1.478578 6.855404 4767 250.0 NaN
2 26 30235 NaN NaN NaN NaN NaN NaN 4.574711 2655 ... NaN NaN NaN NaN NaN 3.099887 14.372579 30235 600.0 75.0
3 26 30314 NaN NaN NaN NaN 3.891820 NaN NaN 2655 ... NaN NaN NaN NaN NaN 3.966329 18.389824 30314 360.0 NaN
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 ... -0.060625 -0.271934 -0.060625 -0.271934 -0.211309 1.504393 6.975095 31393 640.0 NaN

5 rows × 44 columns

add town feature



In [22]:
%cd '/media/siyuan/0009E198000CD19B/bimbo/origin'
%ls


/media/siyuan/0009E198000CD19B/bimbo/origin
cliente_tabla.csv*  producto_tabla.csv*  test.csv*  town_state.csv*  train.csv*

In [23]:
cliente_tabla = pd.read_csv('cliente_tabla.csv')
town_state = pd.read_csv('town_state.csv')

town_state['town_id'] = town_state['Town'].str.split()
town_state['town_id'] = town_state['Town'].str.split(expand = True)

train_basic_feature = pivot_train_categorical_useful_time_LR_weight[['Cliente_ID','Producto_ID','Agencia_ID']]

cliente_per_town = pd.merge(train_basic_feature,cliente_tabla,on = 'Cliente_ID',how= 'inner' )
cliente_per_town = pd.merge(cliente_per_town,town_state[['Agencia_ID','town_id']],on = 'Agencia_ID',how= 'inner' )

cliente_per_town_count = cliente_per_town[['NombreCliente','town_id']].groupby('town_id').count().reset_index()
cliente_per_town_count['NombreCliente'] = cliente_per_town_count['NombreCliente']/float(100000)

cliente_per_town_count_final = pd.merge(cliente_per_town[['Cliente_ID','Producto_ID','Agencia_ID','town_id']],
                                        cliente_per_town_count,on = 'town_id',how = 'left')

pivot_train_categorical_useful_time_LR_weight_town = pd.merge(pivot_train_categorical_useful_time_LR_weight,
                                                              cliente_per_town_count_final[['Cliente_ID','Producto_ID','NombreCliente']],
                                                              on = ['Cliente_ID','Producto_ID'],how = 'left')

In [24]:
cliente_tabla.head()


Out[24]:
Cliente_ID NombreCliente
0 0 SIN NOMBRE
1 1 OXXO XINANTECATL
2 2 SIN NOMBRE
3 3 EL MORENO
4 4 SDN SER DE ALIM CUERPO SA CIA DE INT

In [25]:
town_state.head()


Out[25]:
Agencia_ID Town State
0 1110 2008 AG. LAGO FILT MÉXICO, D.F.
1 1111 2002 AG. AZCAPOTZALCO MÉXICO, D.F.
2 1112 2004 AG. CUAUTITLAN ESTADO DE MÉXICO
3 1113 2008 AG. LAGO FILT MÉXICO, D.F.
4 1114 2029 AG.IZTAPALAPA 2 MÉXICO, D.F.

In [26]:
town_state['town_id'] = town_state['Town'].str.split()
town_state['town_id'] = town_state['Town'].str.split(expand = True)

In [27]:
town_state.head()


Out[27]:
Agencia_ID Town State town_id
0 1110 2008 AG. LAGO FILT MÉXICO, D.F. 2008
1 1111 2002 AG. AZCAPOTZALCO MÉXICO, D.F. 2002
2 1112 2004 AG. CUAUTITLAN ESTADO DE MÉXICO 2004
3 1113 2008 AG. LAGO FILT MÉXICO, D.F. 2008
4 1114 2029 AG.IZTAPALAPA 2 MÉXICO, D.F. 2029

In [13]:
pivot_train_categorical_useful_time_LR_weight.columns.values


Out[13]:
array(['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 'ID', 'weight', 'weight_per_piece'], dtype=object)

In [34]:
train_basic_feature = pivot_train_categorical_useful_time_LR_weight[['Cliente_ID','Producto_ID','Agencia_ID']]

In [35]:
cliente_per_town = pd.merge(train_basic_feature,cliente_tabla,on = 'Cliente_ID',how= 'inner' )
cliente_per_town = pd.merge(cliente_per_town,town_state[['Agencia_ID','town_id']],on = 'Agencia_ID',how= 'inner' )

In [36]:
cliente_per_town.head()


Out[36]:
Cliente_ID Producto_ID Agencia_ID NombreCliente town_id
0 26 1182 2061 BODEGA COMERCIAL MEXICANA TOLUCA 2175
1 26 4767 2061 BODEGA COMERCIAL MEXICANA TOLUCA 2175
2 26 31393 2061 BODEGA COMERCIAL MEXICANA TOLUCA 2175
3 26 31690 2061 BODEGA COMERCIAL MEXICANA TOLUCA 2175
4 26 32953 2061 BODEGA COMERCIAL MEXICANA TOLUCA 2175

In [37]:
cliente_per_town_count = cliente_per_town[['NombreCliente','town_id']].groupby('town_id').count().reset_index()
cliente_per_town_count['NombreCliente'] = cliente_per_town_count['NombreCliente']/float(100000)

In [38]:
cliente_per_town_count.head()


Out[38]:
town_id NombreCliente
0 2001 2.74188
1 2002 1.42954
2 2003 1.74497
3 2004 1.22725
4 2007 1.97018

In [39]:
cliente_per_town_count_final = pd.merge(cliente_per_town[['Cliente_ID','Producto_ID','Agencia_ID','town_id']],
                                        cliente_per_town_count,on = 'town_id',how = 'left')
cliente_per_town_count_final.head()


Out[39]:
Cliente_ID Producto_ID Agencia_ID town_id NombreCliente
0 26 1182 2061 2175 0.34498
1 26 4767 2061 2175 0.34498
2 26 31393 2061 2175 0.34498
3 26 31690 2061 2175 0.34498
4 26 32953 2061 2175 0.34498

In [42]:
pivot_train_categorical_useful_time_LR_weight_town = pd.merge(pivot_train_categorical_useful_time_LR_weight,
                                                              cliente_per_town_count_final[['Cliente_ID','Producto_ID','NombreCliente']],
                                                              on = ['Cliente_ID','Producto_ID'],how = 'left')
pivot_train_categorical_useful_time_LR_weight_town.head()


Out[42]:
Cliente_ID Producto_ID Sem3 Sem4 Sem5 Sem6 Sem7 Sem8 Sem9 Agencia_ID ... t2_min_t5 t3_min_t4 t3_min_t5 t4_min_t5 LR_prod LR_prod_corr ID weight weight_per_piece NombreCliente
0 26 1182 3.688879 NaN NaN NaN 0.000000 0.000000 NaN 2061 ... NaN NaN NaN NaN 1.775590 8.232495 1182 210.0 210.0 0.34498
1 26 4767 3.761200 NaN NaN NaN 3.761200 3.761200 NaN 2061 ... NaN NaN NaN NaN 1.478578 6.855404 4767 250.0 NaN 0.34498
2 26 30235 NaN NaN NaN NaN NaN NaN 4.574711 2655 ... NaN NaN NaN NaN 3.099887 14.372579 30235 600.0 75.0 0.00734
3 26 30314 NaN NaN NaN NaN 3.891820 NaN NaN 2655 ... NaN NaN NaN NaN 3.966329 18.389824 30314 360.0 NaN 0.00734
4 26 31393 3.044522 2.833213 2.772589 2.772589 2.944439 3.135494 2.639057 2061 ... -0.271934 -0.060625 -0.271934 -0.211309 1.504393 6.975095 31393 640.0 NaN 0.34498

5 rows × 45 columns


In [43]:
pivot_train_categorical_useful_time_LR_weight_town.columns.values


Out[43]:
array(['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
       'Sem8', 'Sem9', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
       'agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 'ID', 'weight', 'weight_per_piece', 'NombreCliente'], dtype=object)

begin xgboost training



In [7]:
train_pivot_xgb_time1.columns.values


Out[7]:
array(['Agencia_ID', 'Canal_ID', 'Cliente_ID', 'LR_prod', 'LR_prod_corr',
       'NombreCliente', 'Producto_ID', 'Ruta_SAK', 'Sem3_cum', 'Sem4_cum',
       'Sem5_cum', 'Sem6_cum', 'Sem7_cum', 'Sem8_cum',
       'agen_cliente_for_log_de', 'agen_for_log_de',
       'agen_producto_for_log_de', 'agen_ruta_for_log_de',
       'cliente_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_for_log_de', 'corr', 'producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_for_log_de',
       'ruta_producto_for_log_de', 't1_min_t2', 't1_min_t3', 't1_min_t4',
       't1_min_t5', 't2_min_t3', 't2_min_t4', 't2_min_t5', 't3_min_t4',
       't3_min_t5', 't4_min_t5', 't_min_1', 't_min_2', 't_min_3',
       't_min_4', 't_min_5', 'target'], dtype=object)

In [8]:
train_pivot_xgb_time1 = train_pivot_xgb_time1.drop(['Cliente_ID','Producto_ID','Agencia_ID',
                                                                            'Ruta_SAK','Canal_ID'],axis = 1)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-8-bf85e99ba623> in <module>()
      1 train_pivot_xgb_time1 = train_pivot_xgb_time1.drop(['Cliente_ID','Producto_ID','Agencia_ID',
----> 2                                                                             'Ruta_SAK','ID','Canal_ID'],axis = 1)

/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in drop(self, labels, axis, level, inplace, errors)
   1875                 new_axis = axis.drop(labels, level=level, errors=errors)
   1876             else:
-> 1877                 new_axis = axis.drop(labels, errors=errors)
   1878             dropped = self.reindex(**{axis_name: new_axis})
   1879             try:

/usr/local/lib/python2.7/dist-packages/pandas/indexes/base.pyc in drop(self, labels, errors)
   3049             if errors != 'ignore':
   3050                 raise ValueError('labels %s not contained in axis' %
-> 3051                                  labels[mask])
   3052             indexer = indexer[~mask]
   3053         return self.delete(indexer)

ValueError: labels ['ID'] not contained in axis

In [45]:
pivot_train_categorical_useful_train_time_no_nan = pivot_train_categorical_useful_train[pivot_train_categorical_useful_train['Sem8'].notnull()]
# pivot_train_categorical_useful_train_time_no_nan = pivot_train_categorical_useful_train[pivot_train_categorical_useful_train['Sem9'].notnull()]

In [46]:
pivot_train_categorical_useful_train_time_no_nan_sample = pivot_train_categorical_useful_train_time_no_nan.sample(1000000)

In [47]:
train_feature = pivot_train_categorical_useful_train_time_no_nan_sample.drop(['Sem8','Sem9'],axis = 1)
train_label = pivot_train_categorical_useful_train_time_no_nan_sample[['Sem8','Sem9']]

In [9]:
#seperate train and test data
# datasource: sparse_week_Agencia_Canal_Ruta_normalized_csr label:train_label
%time train_set, valid_set, train_labels, valid_labels = train_test_split(train_feature,\
                                                                                  train_label, test_size=0.10)


CPU times: user 736 ms, sys: 36 ms, total: 772 ms
Wall time: 891 ms

In [48]:
# dtrain = xgb.DMatrix(train_feature,label = train_label['Sem8'],missing=NaN)
dtrain = xgb.DMatrix(train_feature,label = train_label['Sem8'],missing=NaN)

In [49]:
param = {'booster':'gbtree',
         'nthread': 7,
         'max_depth':6, 
         'eta':0.2,
         'silent':0,
         'subsample':0.7, 
         'objective':'reg:linear',
         'eval_metric':'rmse',
         'colsample_bytree':0.7}
# param = {'eta':0.1, 'eval_metric':'rmse','nthread': 8}


# evallist  = [(dvalid,'eval'), (dtrain,'train')]

In [ ]:
num_round = 1000
# plst = param.items()

# bst = xgb.train( plst, dtrain, num_round, evallist )
cvresult = xgb.cv(param, dtrain, num_round, nfold=5,show_progress=True,show_stdv=False,
                        seed = 0, early_stopping_rounds=10)
print(cvresult.tail())

for 1 week later


  • cv rmse 0.451181 with dummy canal, time regr,
  • cv rmse 0.450972 without dummy canal, time regr,
  • cv rmse 0.4485676 without dummy canal, time regr, producto info
  • cv rmse 0.4487434 without dummy canal, time regr, producto info, cliente_per_town

for 2 week later


  • cv rmse 0.4513236 without dummy canal, time regr, producto info

In [51]:
# xgb.plot_importance(cvresult)

In [ ]: