In [1]:
import pandas as pd
import numpy as np
import random
import re
from scipy import sparse
from sklearn import preprocessing
from sklearn import utils
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
import luigi

In [3]:
def readin_train(path):
    """read the training csv file in

    Args :
        path : project path

    Returns:
        training dataset, with Demanda_uni_equil renamed to "label"

    """
    dtypes = {'Semana' : 'int32',
              'Agencia_ID' :'int32',
              'Canal_ID' : 'int32',
              'Ruta_SAK' : 'int32',
'Cliente_ID' : 'int32',
              'Producto_ID':'int32',
              'Venta_hoy':'float32',
              'Venta_uni_hoy': 'int32',
              'Dev_uni_proxima':'int32',
              'Dev_proxima':'float32',
              'Demanda_uni_equil':'int32'}
    
    
    names_dict = {"Demanda_uni_equil" : "label"}
    train_dataset = pd.read_csv(path + "/data/interim/train_sample.csv",
                                usecols =['Semana','Agencia_ID','Canal_ID','Ruta_SAK','Cliente_ID','Producto_ID','Demanda_uni_equil'],
                                dtype  = dtypes)
                                
    train_dataset.rename(columns = names_dict, inplace = True)
    return train_dataset
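
The dtype map downcasts every column from pandas' default int64/float64 to 32-bit types, roughly halving memory, which matters because the full Bimbo training file has tens of millions of rows. A minimal check of the saving (the series here is synthetic):

s = pd.Series(np.arange(1000000))
print("int64 bytes : %s" % s.astype("int64").memory_usage())
print("int32 bytes : %s" % s.astype("int32").memory_usage())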

In [4]:
def readin_test(path):
    """read the test csv file in

    Args :
        path : project path

    Returns:
        test dataset

    """
    
    dtypes = {'Semana' : 'int32',
              'Agencia_ID' :'int32',
              'Canal_ID' : 'int32',
              'Ruta_SAK' : 'int32',
'Cliente_ID' : 'int32',
              'Producto_ID':'int32',
              'Venta_hoy':'float32',
              'Venta_uni_hoy': 'int32',
              'Dev_uni_proxima':'int32',
              'Dev_proxima':'float32',
              'Demanda_uni_equil':'int32'}
    
    test_dataset = pd.read_csv(path + "/data/interim/test_sample.csv",
                                usecols =['Semana','Agencia_ID','Canal_ID','Ruta_SAK','Cliente_ID','Producto_ID'],
                                dtype  = dtypes)
    return test_dataset

In [5]:
def pivot_table(df):
    """pivot the table: for each (client, product) pair, show the week 3 to week 9 demand

    Week3 : product1, client1, demand
    Week4 : product1, client1, demand
    ...
    Week9 : product1, client1, demand

    to

    product1, client1 : week3_demand, week4_demand, ..., week9_demand

    The demand is log1p-transformed before pivoting.

    Args :
        df : original training dataset

    Returns :
        pivoted dataset, one row per (Cliente_ID, Producto_ID) with columns label_3 ... label_9
    """
    df["label"] = df["label"].apply(np.log1p)
    pivot_df = pd.pivot_table(data = df[["Semana","Cliente_ID", "Producto_ID", "label"]],
                              values = "label", index = ["Cliente_ID", "Producto_ID"],
                              columns = ["Semana"],  aggfunc= np.mean).reset_index()
    
    pivot_df.columns = ["label_" + str(ele) if str(ele).isdigit() else ele for ele in pivot_df.columns.values]
    return pivot_df
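
A minimal sketch of what pivot_table produces, on a hypothetical three-row frame (the column names match the real dataset; the values are made up):

toy = pd.DataFrame({"Semana": [3, 4, 3],
                    "Cliente_ID": [26, 26, 99],
                    "Producto_ID": [1212, 1212, 1250],
                    "label": [4, 6, 2]})
pivot_table(toy)
# one row per (Cliente_ID, Producto_ID); label_3 and label_4 hold the
# log1p-transformed demand, and NaN where the pair was not seen that week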

In [6]:
def gen_se_dist(df):
    """generate statistical distribution features for the Producto_ID (product) and Cliente_ID (client) columns

    The main issue with this dataset is that most features are nominal IDs,
    so it helps to generate numerical distribution features for each reasonable
    grouping of the nominal columns.

    Args :
        df : pivoted training dataset

    Returns :
        dist info at (client, product), product-only, and client-only level
    """
    df_se_pro_cli_dist = df.groupby(["Cliente_ID","Producto_ID"],as_index = False).\
                        agg(['count','sum', 'min', 'max','median','mean']).reset_index()
    df_se_pro_cli_dist.columns = ["_".join(ele) + "_spc" if ele[1] != "" 
                               else ele[0] for ele in df_se_pro_cli_dist.columns.values]
        
    df_se_pro_dist = df.drop("Cliente_ID",1).groupby(["Producto_ID"],as_index = False).\
                        agg(['count','sum', 'min', 'max','median','mean']).reset_index()
    df_se_pro_dist.columns = ["_".join(ele) + "_sp" if ele[1] != "" 
                               else ele[0] for ele in df_se_pro_dist.columns.values]
        
    df_se_cli_dist = df.drop("Producto_ID",1).groupby(["Cliente_ID"],as_index = False).\
                        agg(['count','sum', 'min', 'max','median','mean']).reset_index()
    df_se_cli_dist.columns = ["_".join(ele) + "_sc" if ele[1] != "" 
                           else ele[0] for ele in df_se_cli_dist.columns.values]
        
    return df_se_pro_cli_dist, df_se_pro_dist, df_se_cli_dist
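
The agg calls produce MultiIndex columns; the list comprehensions flatten them and tag each with its grouping level (_spc for the client-product pair, _sp for product only, _sc for client only). A minimal sketch of the flattening, with hypothetical column tuples:

cols = [("Producto_ID", ""), ("label_3", "count"), ("label_3", "mean")]
print(["_".join(ele) + "_sp" if ele[1] != "" else ele[0] for ele in cols])
# ['Producto_ID', 'label_3_count_sp', 'label_3_mean_sp']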

In [7]:
def gen_pro_cli_dist(df):
    """generate label distribution features per (Cliente_ID, Producto_ID) pair on the raw (un-pivoted) training data"""
    df_cli_pro_dist = df[["Cliente_ID","Producto_ID",'label']].\
                 groupby(["Cliente_ID","Producto_ID"],as_index = False).\
                 agg(['count','sum', 'min', 'max','median','mean']).reset_index()
    df_cli_pro_dist.columns = ["_".join(ele) + "_cp" if ele[1] != "" 
                       else ele[0] for ele in df_cli_pro_dist.columns.values]
    return df_cli_pro_dist

In [8]:
def merge_df_dist(df, dist):
    """merge the dist info with train_dataset or test_dataset"""
    dist_join_key = [ele for ele in dist.columns if ele.split("_")[0] != "label"]
    df = df.merge(dist, how = 'left', on = dist_join_key)
    return df
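
merge_df_dist infers its join keys from the column names: every column whose first "_"-token is not "label" is treated as a key. A tiny illustration with hypothetical names following the suffix convention above:

dist_cols = ["Producto_ID", "label_3_count_sp", "label_3_mean_sp"]
print([ele for ele in dist_cols if ele.split("_")[0] != "label"])
# ['Producto_ID'] : a product-level table joins on Producto_ID alone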

In [9]:
def create_lag(df):
    """create time lags between the weekly demand columns

    Week 10 is the held-out data and week 11 is the data we need to predict.
    For week 10 we create lag1 and lag2 differences within weeks 3 ~ 8;
    for week 11 we create lag1 and lag2 differences within weeks 4 ~ 9.

    This should be called after pivot_table, once the label_3 ... label_9 columns exist.

    Args :
        df : pivoted training data (or test data already merged with dist info)
    """
    periods = [(3,8), (4,9)]
    lag_time = [1, 2]
    
    for period in periods:
        for lag in lag_time:
            for index in range(period[0], period[1] + 1 - lag):
                # "_min_" is short for "minus" : the demand difference between the two weeks
                df['label_' + str(index + lag) + '_min_' + str(index)] = df["label_" + str(index + lag)].subtract(df["label_" + str(index)])
    return df
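
A quick sketch of the columns create_lag generates, on a one-row frame with made-up values:

toy_lag = pd.DataFrame({"label_" + str(w): [float(w)] for w in range(3, 10)})
toy_lag = create_lag(toy_lag)
print(toy_lag.filter(like="_min_").columns.tolist())
# period (3, 8) with lag 1 yields label_4_min_3 ... label_8_min_7,
# lag 2 yields label_5_min_3 ... label_8_min_6, and likewise for (4, 9)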

In [10]:
def external_info(path):
    """add additional information from /data/external; this data was provided by Kaggle

    External source names :
        producto_tabla
        town_state

    """
    producto_tabla = pd.read_csv(path + "/data/external/producto_tabla.csv")
    town_state = pd.read_csv(path + "/data/external/town_state.csv")
    
    # the town id is the first whitespace-separated token of the Town string
    town_state['town_id'] = town_state['Town'].str.split().str[0]
    town_state = pd.concat([town_state, pd.get_dummies(town_state[["town_id","State"]], prefix=['town_id', 'state_id'])],axis = 1)
    town_state.drop(["Town", "State", "town_id"], axis = 1, inplace = True)
    
    reg_weight = r" (\d{1,10})g "
    reg_piece = r" (\d{1,10})p "
    producto_tabla["weight"] = producto_tabla["NombreProducto"].apply(lambda x: re.findall(reg_weight, x)[0] if re.search(reg_weight, x) else np.nan)
    producto_tabla["piece"] = producto_tabla["NombreProducto"].apply(lambda x: re.findall(reg_piece, x)[0] if re.search(reg_piece, x) else np.nan)
    producto_tabla["wei_per_piece"] = producto_tabla["weight"].astype(float).divide(producto_tabla["piece"].astype(float))
    producto_tabla.drop("NombreProducto", axis = 1, inplace = True)
    
    return town_state, producto_tabla
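
The two regexes pull the weight in grams and the piece count out of NombreProducto. A quick check on a made-up name that follows the dataset's typical "<name> <weight>g <pieces>p <brand> <id>" format:

name = "Pan Blanco 640g 2p BIM 1234"   # hypothetical product name
print(re.findall(r" (\d{1,10})g ", name))   # ['640']
print(re.findall(r" (\d{1,10})p ", name))   # ['2']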

In [33]:
def gen_dist_train_test(train_df, test_df, path, pivot_table, gen_se_dist, gen_pro_cli_dist, external_info):
    """generate dist information on the training data, then merge it into both the training and the test data

    The data flow looks like this:
        train_df ==> pivot_table ==> gen_se_dist ====> get dist info ==> merge info (train and test)
                |               |                   |
                |                ==> create_lag ====|
                |                                   |
                 ==> gen_pro_cli_dist ==============|
                |                                   |
                 ==> external_info =================|

    After these steps train_df.shape[0] decreases (duplicates are dropped),
    while test_df.shape[0] does not change.
    """
    
    df_cli_pro_dist = gen_pro_cli_dist(train_df)
    
    train_pivot = pivot_table(train_df)
    df_se_pro_cli_dist, df_se_pro_dist, df_se_cli_dist = gen_se_dist(train_pivot)
    df_lag = create_lag(train_pivot)

    dist_list = [df_cli_pro_dist, df_lag, df_se_pro_cli_dist, df_se_pro_dist, df_se_cli_dist]
    
    for dist in dist_list:
        dist_join_key = [ele for ele in dist.columns if ele.split("_")[0] != "label"]
        train_df = train_df.merge(dist, on = dist_join_key, how = 'left')
        test_df = test_df.merge(dist, on = dist_join_key, how = 'left')
    
    
    town_state, producto_tabla = external_info(path)
    
    train_df = train_df.merge(town_state, on = "Agencia_ID", how = 'left')
    train_df = train_df.merge(producto_tabla, on = "Producto_ID", how = 'left')
    
    test_df = test_df.merge(town_state, on = "Agencia_ID", how = 'left')
    test_df = test_df.merge(producto_tabla, on = "Producto_ID", how = 'left')
    
    train_df.drop(["Semana", "Agencia_ID", "Canal_ID", "Ruta_SAK", "Cliente_ID", "Producto_ID", "label"], axis = 1, inplace = True)
    train_df.drop_duplicates(inplace = True)
    
    test_df.drop(["Semana", "Agencia_ID", "Canal_ID", "Ruta_SAK", "Cliente_ID", "Producto_ID"], axis = 1, inplace = True)
        
    return train_df, test_df

In [34]:
def filter_name(column_name, type_name):
    """filter column names for the train, hold-out and test data fed to the model

    Args :
        column_name : colnames of training_df and test_df
        type_name : can either be "train", "hold", "test"

    Returns:
        filtered columns based on type
    """
    filtered_name = []
    if type_name == "train":
        # weeks 8 and 9 must not appear in the features; the label is week 9
        for name in column_name:
            if str(8) in name.split("_") or str(9) in name.split("_"):
                pass
            else:
                filtered_name.append(name)
    elif type_name == "hold":
        # weeks 3 and 9 must not appear in the features; the label is week 10
        # (scored on the Kaggle public board, so there is no local label)
        for name in column_name:
            if str(3) in name.split("_") or str(9) in name.split("_"):
                pass
            else:
                filtered_name.append(name)
    else:
        # weeks 3 and 4 must not appear in the features; the label is week 11
        # (scored on the Kaggle private board, so there is no local label)
        for name in column_name:
            if str(3) in name.split("_") or str(4) in name.split("_"):
                pass
            else:
                filtered_name.append(name)
    
    return filtered_name
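
A quick check of which columns survive each split (the column list is hypothetical):

cols = ["label_" + str(w) for w in range(3, 10)] + ["label_4_min_3", "weight"]
print(filter_name(cols, "train"))   # keeps label_3 ... label_7, label_4_min_3, weight
print(filter_name(cols, "hold"))    # keeps label_4 ... label_8, weight
print(filter_name(cols, "test"))    # keeps label_5 ... label_9, weight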

In [35]:
def prepare_train_hold_test(train_df, test_df):
    """given the time structure of this problem, we should split the train/hold/test data along the time axis

    training_data : week3, week4, week5, week6, week7 ==> week9
    hold_out_data : week4, week5, week6, week7, week8 ==> week10 (result from the Kaggle public board)
    test_data : week5, week6, week7, week8, week9 ==> week11 (result from the Kaggle private board)

    generates (train_data, train_label), hold_data and test_data
    """
    train_data = train_df[filter_name(train_df.columns.values, "train")]
    train_label = train_df["label_9"]
    
    hold_data = train_df[filter_name(train_df.columns.values, "hold")]
    test_data = test_df[filter_name(test_df.columns.values, "test")]
    
    return train_data, train_label, hold_data, test_data

In [41]:
def prepare_whole():
    """prepare the train_df and test_df, write the processed splits to disk and return them"""
    path = "~/bimbo_kaggle_competition"
    train_dataset = readin_train(path)
    test_dataset = readin_test(path)
    train_dataset, test_dataset = gen_dist_train_test(train_dataset, test_dataset,
                                                      path,
                                                      pivot_table,
                                                      gen_se_dist,
                                                      gen_pro_cli_dist,
                                                      external_info)
    train_data, train_label, hold_data, test_data = prepare_train_hold_test(train_dataset, test_dataset)
    
    train_data.to_csv(path + "/data/processed/train.csv", index = False)
    train_label.to_csv(path + "/data/processed/train_label.csv", index = False)
    hold_data.to_csv(path + "/data/processed/hold.csv", index = False)
    test_data.to_csv(path + "/data/processed/test.csv", index = False)
    
    print("Data preparation is done")
    print("%s features are created" % train_data.shape[1])
    
    print("training data size : %s rows" % train_data.shape[0])
    print("holdout data size : %s rows" % hold_data.shape[0])
    print("test data size : %s rows" % test_data.shape[0])
    
    return train_data, train_label, hold_data, test_data

In [42]:
train_data, train_label, hold_data, test_data = prepare_whole()

In [43]:
train_data.head()


Out[43]:
label_count_cp label_sum_cp label_min_cp label_max_cp label_median_cp label_mean_cp label_3 label_4 label_5 label_6 ... state_id_SONORA state_id_TABASCO state_id_TAMAULIPAS state_id_TLAXCALA state_id_VERACRUZ state_id_YUCATÁN state_id_ZACATECAS weight piece wei_per_piece
0 1 1 1 1 1.0 1.0 NaN NaN NaN NaN ... 0 0 0 0 0 0 0 540 NaN NaN
1 1 9 9 9 9.0 9.0 NaN NaN NaN NaN ... 0 0 0 0 0 0 0 62 1 62.0
2 1 2 2 2 2.0 2.0 1.098612 NaN NaN NaN ... 0 0 0 0 0 0 0 123 NaN NaN
3 1 1 1 1 1.0 1.0 NaN NaN NaN 0.693147 ... 0 0 0 0 0 0 0 255 NaN NaN
4 1 2 2 2 2.0 2.0 NaN NaN NaN NaN ... 0 0 0 0 0 0 0 360 NaN NaN

5 rows × 401 columns


In [46]:
test_data.shape


Out[46]:
(100000, 401)
