In [5]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg

In [6]:
'''"Gang pao" (lit. "cannon") -- presumably a machine/run nickname; TODO confirm.'''
# NOTE(review): hardcoded absolute local path -- prefer a configurable DATA_DIR.
path = '/home/zongyi/bimbo_data/'

In [7]:
train = gl.SFrame.read_csv(path + 'train_lag5_w8_mean.csv', verbose=False)

In [9]:
# Drop the row id and every raw target/sales column: the model trains on
# Demada_log, and these columns would leak the answer into the features.
for _col in ('id', 'Venta_uni_hoy', 'Venta_hoy',
             'Dev_uni_proxima', 'Dev_proxima', 'Demanda_uni_equil'):
    del train[_col]

In [10]:
# Drop aggregate/meta columns excluded from this model's feature set.
for _col in ('prior_sum', 'lag_sum', 'week_times', 'Semana', 'Canal_ID'):
    del train[_col]

In [6]:
# town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
# train = train.join(town, on=['Agencia_ID','Producto_ID'], how='left')
# train = train.fillna('t_c',1)
# train = train.fillna('tcc',0)
# train = train.fillna('tp_sum',0)
# del train['Town']

In [7]:
# relag_train = gl.SFrame.read_csv(path + 're_lag_train.csv', verbose=False)
# train = train.join(relag_train, on=['Cliente_ID','Producto_ID','Semana'], how='left')
# train = train.fillna('re_lag1',0)
# train = train.fillna('re_lag2',0)
# train = train.fillna('re_lag3',0)
# train = train.fillna('re_lag4',0)
# train = train.fillna('re_lag5',0)
# del relag_train

In [8]:
# pd = gl.SFrame.read_csv(path + 'products.csv', verbose=False)
# train = train.join(pd, on=['Producto_ID'], how='left')
# train = train.fillna('prom',0)
# train = train.fillna('weight',0)
# train = train.fillna('pieces',1)
# train = train.fillna('w_per_piece',0)
# train = train.fillna('healthy',0)
# train = train.fillna('drink',0)
# del train['brand']
# del train['NombreProducto']
# del pd

In [9]:
# client = gl.SFrame.read_csv(path + 'clients.csv', verbose=False)
# train = train.join(client, on=['Cliente_ID'], how='left')
# del client

In [11]:
train.head()


Out[11]:
Agencia_ID Ruta_SAK Cliente_ID Producto_ID Demada_log lag1 lag2 lag3 lag4 lag5
1542 4501 4676035 1242 1.38629 1.38629 1.60944 1.09861 1.60944 1.38629
1138 1033 171033 1146 1.94591 2.19722 1.38629 2.07944 2.07944 2.19722
2238 4105 494132 40217 1.60944 2.19722 2.30259 0.0 0.0 2.48491
1350 1233 2030945 1232 1.09861 1.09861 1.09861 0.693147 1.09861 0.693147
1236 1203 85081 1182 1.94591 0.0 0.0 0.0 0.0 0.0
1629 1020 4165785 1182 2.30259 2.19722 2.19722 2.70805 2.3979 2.19722
1453 1107 10642 43207 3.43399 3.3673 2.89037 3.58352 2.77259 3.04452
2038 2814 1224421 36304 1.09861 0.0 0.0 0.0 0.0 0.0
1618 1261 858571 2665 0.693147 0.0 0.0 0.0 0.0 0.0
1120 2113 174432 30532 1.60944 1.09861 1.94591 0.0 0.0 2.30259
n_a n_r n_c n_p mpca
4920.33 17705.3 17.0 150460.0 1.38576
27322.7 8605.67 25.6667 117711.0 1.78838
9321.67 1760.0 10.3333 9504.67 2.13936
15862.7 14950.0 5.0 108023.0 0.943928
30028.7 32879.3 13.3333 52275.3 1.37841
40219.0 10540.0 20.3333 52275.3 2.2048
20021.3 17868.3 23.0 25734.0 2.94955
13577.7 9520.33 8.0 4284.33 1.07856
18915.7 13758.3 11.3333 24787.7 1.04317
36187.3 14982.7 14.6667 90304.3 1.79548
[10 rows x 15 columns]

In [ ]:
# Make a train-test split
# train_data, test_data = train.random_split(0.999)

# Create a model.
# Gradient-boosted trees on log1p demand ('Demada_log' -- column name as
# spelled in the prepared CSV).  Training on the full frame; no validation
# set, so Training-rmse is the only monitored metric.
model = gl.boosted_trees_regression.create(train, target='Demada_log',
                                           step_size=0.1,          # learning rate
                                           max_iterations=1200,    # boosting rounds
                                           max_depth = 10,
                                          metric='rmse',
                                          random_seed=369,         # reproducibility
                                          column_subsample=0.7,    # feature bagging
                                          row_subsample=0.85,      # row bagging
                                          validation_set=None,     # train on everything
                                          model_checkpoint_path=path,
                                          model_checkpoint_interval=1200)


Boosted trees regression:
--------------------------------------------------------
Number of examples          : 10406868
Number of features          : 14
Number of unpacked features : 14
+-----------+--------------+---------------+
| Iteration | Elapsed Time | Training-rmse |
+-----------+--------------+---------------+
| 1         | 27.983087    | 1.263969      |
| 2         | 55.050844    | 1.155445      |
| 3         | 81.809064    | 1.059701      |
| 4         | 105.795573   | 0.977373      |

In [ ]:
'''Feature importance'''
# Split-occurrence counts per feature; add_row_number() adds an 'id' column
# used as x positions for the bar chart in the next cell.
w = model.get_feature_importance()
w = w.add_row_number()
w

In [ ]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'}) 
%matplotlib inline

figsize(12, 6)
plt.bar(w['id'], w['count'], tick_label=w['name'])

plt.xticks(rotation=45)

predict


In [ ]:
# Test set with the same 5-lag + week-8-mean features as train.
test = gl.SFrame.read_csv(path + 'test_lag5_w8_mean.csv', verbose=False)
# (town join disabled -- kept for reference)
# test = test.join(town, on=['Agencia_ID','Producto_ID'], how='left')
# del test['Town']
# test = test.fillna('t_c',1)
# test = test.fillna('tcc',0)
# test = test.fillna('tp_sum',0)

In [ ]:
# Drop the same meta columns removed from train so test matches the model's
# feature set.  Note: 'Semana' is kept here -- feature_w11 needs it to build
# the shifted week-11 lag.
for _col in ('prior_sum', 'lag_sum', 'week_times', 'Canal_ID'):
    del test[_col]

In [ ]:
def feature_w11(test, lag_sum=0, prior_sum=0):
    """Add a week-11 lag feature derived from the model's own predictions.

    Predicts demand (log space) for every test row with the trained global
    `model`, averages the predictions per (Semana, Cliente_ID, Producto_ID),
    shifts them one week forward, and adds them onto `lag1` -- which is 0 for
    week-11 rows, so this effectively fills the missing one-week lag.

    Parameters
    ----------
    test : SFrame
        Must contain id, Semana, Cliente_ID, Producto_ID and the lag columns.
    lag_sum : int
        If 1, also fold the new lag into the `lag_sum` column.
    prior_sum : int
        If 1, also fold the per-key prediction SUM into `prior_sum`.

    Returns the augmented SFrame.  Fix: the caller's `test` is no longer
    mutated -- the original implementation deleted 'id' and 'Semana' from the
    argument in place, which also made a second call on the same frame crash.
    """
    test_full = test.copy()

    # Work on a separate copy when stripping the non-feature columns that
    # model.predict() must not see, leaving the caller's frame intact.
    features = test.copy()
    ids = features['id']
    del features['id']
    del features['Semana']
    demand_log = model.predict(features)

    # Attach the predictions back onto the full frame by row id.
    sub1 = gl.SFrame({'id': ids, 'Demanda_uni_equil': demand_log})
    test_full = test_full.join(sub1, on=['id'], how='left')

    # Mean predicted demand per key, shifted one week forward -> week-11 lag.
    lag11 = test_full.groupby(key_columns=['Semana', 'Cliente_ID', 'Producto_ID'],
                              operations={'lag11': agg.MEAN('Demanda_uni_equil')})
    lag11['Semana'] = lag11['Semana'].apply(lambda x: x + 1)
    test_full = test_full.join(lag11, on=['Semana', 'Cliente_ID', 'Producto_ID'], how='left')
    test_full = test_full.fillna('lag11', 0)
    # lag1 is 0 for week-11 rows, so this addition fills rather than skews it.
    test_full['lag1'] = test_full['lag1'] + test_full['lag11']

    if lag_sum == 1:
        test_full['lag_sum'] = test_full['lag_sum'] + test_full['lag11']

    if prior_sum == 1:
        # Same shift trick with SUM for the running prior_sum feature.
        lag_sum11 = test_full.groupby(key_columns=['Semana', 'Cliente_ID', 'Producto_ID'],
                                      operations={'lag_sum11': agg.SUM('Demanda_uni_equil')})
        lag_sum11['Semana'] = lag_sum11['Semana'].apply(lambda x: x + 1)
        test_full = test_full.join(lag_sum11, on=['Semana', 'Cliente_ID', 'Producto_ID'], how='left')
        test_full = test_full.fillna('lag_sum11', 0)
        test_full['prior_sum'] = test_full['prior_sum'] + test_full['lag_sum11']
        del test_full['lag_sum11']

    # Drop helper columns before returning the feature frame.
    del test_full['lag11']
    del test_full['Demanda_uni_equil']
    return test_full

In [ ]:
# Build the week-11 lag from the model's own week-10 predictions, then score
# the full test set (predictions are still in log space here).
test_full = feature_w11(test, lag_sum=0, prior_sum=0)

row_ids = test_full['id']
del test_full['id']
del test_full['Semana']
log_preds = model.predict(test_full)
sub = gl.SFrame({'id': row_ids, 'Demanda_uni_equil': log_preds})

In [ ]:
# Fix: dropped the redundant mid-notebook `import math` -- math is already
# imported in the top imports cell; scattered imports break linear re-runs.
# Map log-space predictions back to demand units with expm1 (inverse of
# log1p), clipping negatives to 0 first so predicted demand is >= 0.
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(lambda x: math.expm1(max(0, x)))

In [ ]:
# Submission filename encodes the run configuration:
# w8 = week-8 mean features, f = #features, n = #iterations, c = column_subsample.
file_name = 'w8_f{0}_n{1}_c{2}'.format(model.num_features,
                                       model.max_iterations,
                                       model.column_subsample)
sub.save(path + file_name, format='csv')

In [19]:
sub


Out[19]:
Demanda_uni_equil id
10.2759945106 1241139
1.55656712045 446660
3.624115765 699
5.22596714568 3563354
0.853487732775 4321974
14.59106358 2223933
3.44766689475 5868253
1.09161861938 2800590
8.85764549884 5720532
2.96202633523 761533
[6999251 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [ ]: