In [5]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg

In [6]:
'''"Gang pao" (lit. "cannon") -- presumably a machine/run nickname; TODO confirm.'''
# NOTE(review): hardcoded absolute local path -- prefer a configurable DATA_DIR.
path = '/home/zongyi/bimbo_data/'

In [7]:
train = gl.SFrame.read_csv(path + 'train_lag5_w8_mean.csv', verbose=False)

In [9]:
# Drop the row id and every raw target/sales column: the model trains on
# Demada_log, and these columns would leak the answer into the features.
for _col in ('id', 'Venta_uni_hoy', 'Venta_hoy',
             'Dev_uni_proxima', 'Dev_proxima', 'Demanda_uni_equil'):
    del train[_col]

In [10]:
# Drop aggregate/meta columns excluded from this model's feature set.
for _col in ('prior_sum', 'lag_sum', 'week_times', 'Semana', 'Canal_ID'):
    del train[_col]

In [6]:
# town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
# train = train.join(town, on=['Agencia_ID','Producto_ID'], how='left')
# train = train.fillna('t_c',1)
# train = train.fillna('tcc',0)
# train = train.fillna('tp_sum',0)
# del train['Town']

In [7]:
# relag_train = gl.SFrame.read_csv(path + 're_lag_train.csv', verbose=False)
# train = train.join(relag_train, on=['Cliente_ID','Producto_ID','Semana'], how='left')
# train = train.fillna('re_lag1',0)
# train = train.fillna('re_lag2',0)
# train = train.fillna('re_lag3',0)
# train = train.fillna('re_lag4',0)
# train = train.fillna('re_lag5',0)
# del relag_train

In [8]:
# pd = gl.SFrame.read_csv(path + 'products.csv', verbose=False)
# train = train.join(pd, on=['Producto_ID'], how='left')
# train = train.fillna('prom',0)
# train = train.fillna('weight',0)
# train = train.fillna('pieces',1)
# train = train.fillna('w_per_piece',0)
# train = train.fillna('healthy',0)
# train = train.fillna('drink',0)
# del train['brand']
# del train['NombreProducto']
# del pd

In [9]:
# client = gl.SFrame.read_csv(path + 'clients.csv', verbose=False)
# train = train.join(client, on=['Cliente_ID'], how='left')
# del client

In [11]:
train.head()


Out[11]:
Agencia_ID Ruta_SAK Cliente_ID Producto_ID Demada_log lag1 lag2 lag3 lag4 lag5
1542 4501 4676035 1242 1.38629 1.38629 1.60944 1.09861 1.60944 1.38629
1138 1033 171033 1146 1.94591 2.19722 1.38629 2.07944 2.07944 2.19722
2238 4105 494132 40217 1.60944 2.19722 2.30259 0.0 0.0 2.48491
1350 1233 2030945 1232 1.09861 1.09861 1.09861 0.693147 1.09861 0.693147
1236 1203 85081 1182 1.94591 0.0 0.0 0.0 0.0 0.0
1629 1020 4165785 1182 2.30259 2.19722 2.19722 2.70805 2.3979 2.19722
1453 1107 10642 43207 3.43399 3.3673 2.89037 3.58352 2.77259 3.04452
2038 2814 1224421 36304 1.09861 0.0 0.0 0.0 0.0 0.0
1618 1261 858571 2665 0.693147 0.0 0.0 0.0 0.0 0.0
1120 2113 174432 30532 1.60944 1.09861 1.94591 0.0 0.0 2.30259
n_a n_r n_c n_p mpca
4920.33 17705.3 17.0 150460.0 1.38576
27322.7 8605.67 25.6667 117711.0 1.78838
9321.67 1760.0 10.3333 9504.67 2.13936
15862.7 14950.0 5.0 108023.0 0.943928
30028.7 32879.3 13.3333 52275.3 1.37841
40219.0 10540.0 20.3333 52275.3 2.2048
20021.3 17868.3 23.0 25734.0 2.94955
13577.7 9520.33 8.0 4284.33 1.07856
18915.7 13758.3 11.3333 24787.7 1.04317
36187.3 14982.7 14.6667 90304.3 1.79548
[10 rows x 15 columns]

In [ ]:
# Make a train-test split
# train_data, test_data = train.random_split(0.999)

# Create a model.
# Gradient-boosted trees on log1p demand ('Demada_log' -- column name as
# spelled in the prepared CSV).  Training on the full frame; no validation
# set, so Training-rmse is the only monitored metric.
model = gl.boosted_trees_regression.create(train, target='Demada_log',
                                           step_size=0.1,          # learning rate
                                           max_iterations=1200,    # boosting rounds
                                           max_depth = 10,
                                          metric='rmse',
                                          random_seed=369,         # reproducibility
                                          column_subsample=0.7,    # feature bagging
                                          row_subsample=0.85,      # row bagging
                                          validation_set=None,     # train on everything
                                          model_checkpoint_path=path,
                                          model_checkpoint_interval=1200)


Boosted trees regression:
--------------------------------------------------------
Number of examples          : 10406868
Number of features          : 14
Number of unpacked features : 14
+-----------+--------------+---------------+
| Iteration | Elapsed Time | Training-rmse |
+-----------+--------------+---------------+
| 1         | 27.983087    | 1.263969      |
| 2         | 55.050844    | 1.155445      |
| 3         | 81.809064    | 1.059701      |
| 4         | 105.795573   | 0.977373      |

In [ ]:
'''Feature importance'''
# Split-occurrence counts per feature; add_row_number() adds an 'id' column
# used as x positions for the bar chart in the next cell.
w = model.get_feature_importance()
w = w.add_row_number()
w

In [ ]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'}) 
%matplotlib inline

figsize(12, 6)
plt.bar(w['id'], w['count'], tick_label=w['name'])

plt.xticks(rotation=45)

predict


In [ ]:
# Test set with the same 5-lag + week-8-mean features as train.
test = gl.SFrame.read_csv(path + 'test_lag5_w8_mean.csv', verbose=False)
# (town join disabled -- kept for reference)
# test = test.join(town, on=['Agencia_ID','Producto_ID'], how='left')
# del test['Town']
# test = test.fillna('t_c',1)
# test = test.fillna('tcc',0)
# test = test.fillna('tp_sum',0)

In [ ]:
# Drop the same meta columns removed from train so test matches the model's
# feature set.  Note: 'Semana' is kept here -- feature_w11 needs it to build
# the shifted week-11 lag.
for _col in ('prior_sum', 'lag_sum', 'week_times', 'Canal_ID'):
    del test[_col]

In [ ]:
def feature_w11(test, lag_sum=0, prior_sum=0):
    """Add a week-11 lag feature derived from the model's own predictions.

    Predicts demand (log space) for every test row with the trained global
    `model`, averages the predictions per (Semana, Cliente_ID, Producto_ID),
    shifts them one week forward, and adds them onto `lag1` -- which is 0 for
    week-11 rows, so this effectively fills the missing one-week lag.

    Parameters
    ----------
    test : SFrame
        Must contain id, Semana, Cliente_ID, Producto_ID and the lag columns.
    lag_sum : int
        If 1, also fold the new lag into the `lag_sum` column.
    prior_sum : int
        If 1, also fold the per-key prediction SUM into `prior_sum`.

    Returns the augmented SFrame.  Fix: the caller's `test` is no longer
    mutated -- the original implementation deleted 'id' and 'Semana' from the
    argument in place, which also made a second call on the same frame crash.
    """
    test_full = test.copy()

    # Work on a separate copy when stripping the non-feature columns that
    # model.predict() must not see, leaving the caller's frame intact.
    features = test.copy()
    ids = features['id']
    del features['id']
    del features['Semana']
    demand_log = model.predict(features)

    # Attach the predictions back onto the full frame by row id.
    sub1 = gl.SFrame({'id': ids, 'Demanda_uni_equil': demand_log})
    test_full = test_full.join(sub1, on=['id'], how='left')

    # Mean predicted demand per key, shifted one week forward -> week-11 lag.
    lag11 = test_full.groupby(key_columns=['Semana', 'Cliente_ID', 'Producto_ID'],
                              operations={'lag11': agg.MEAN('Demanda_uni_equil')})
    lag11['Semana'] = lag11['Semana'].apply(lambda x: x + 1)
    test_full = test_full.join(lag11, on=['Semana', 'Cliente_ID', 'Producto_ID'], how='left')
    test_full = test_full.fillna('lag11', 0)
    # lag1 is 0 for week-11 rows, so this addition fills rather than skews it.
    test_full['lag1'] = test_full['lag1'] + test_full['lag11']

    if lag_sum == 1:
        test_full['lag_sum'] = test_full['lag_sum'] + test_full['lag11']

    if prior_sum == 1:
        # Same shift trick with SUM for the running prior_sum feature.
        lag_sum11 = test_full.groupby(key_columns=['Semana', 'Cliente_ID', 'Producto_ID'],
                                      operations={'lag_sum11': agg.SUM('Demanda_uni_equil')})
        lag_sum11['Semana'] = lag_sum11['Semana'].apply(lambda x: x + 1)
        test_full = test_full.join(lag_sum11, on=['Semana', 'Cliente_ID', 'Producto_ID'], how='left')
        test_full = test_full.fillna('lag_sum11', 0)
        test_full['prior_sum'] = test_full['prior_sum'] + test_full['lag_sum11']
        del test_full['lag_sum11']

    # Drop helper columns before returning the feature frame.
    del test_full['lag11']
    del test_full['Demanda_uni_equil']
    return test_full

In [ ]:
# Build the week-11 lag from the model's own week-10 predictions, then score
# the full test set (predictions are still in log space here).
test_full = feature_w11(test, lag_sum=0, prior_sum=0)

row_ids = test_full['id']
del test_full['id']
del test_full['Semana']
log_preds = model.predict(test_full)
sub = gl.SFrame({'id': row_ids, 'Demanda_uni_equil': log_preds})

In [ ]:
# Fix: dropped the redundant mid-notebook `import math` -- math is already
# imported in the top imports cell; scattered imports break linear re-runs.
# Map log-space predictions back to demand units with expm1 (inverse of
# log1p), clipping negatives to 0 first so predicted demand is >= 0.
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(lambda x: math.expm1(max(0, x)))

In [ ]:
# Submission filename encodes the run configuration:
# w8 = week-8 mean features, f = #features, n = #iterations, c = column_subsample.
file_name = 'w8_f{0}_n{1}_c{2}'.format(model.num_features,
                                       model.max_iterations,
                                       model.column_subsample)
sub.save(path + file_name, format='csv')

In [19]:
sub


Out[19]:
Demanda_uni_equil id
10.2759945106 1241139
1.55656712045 446660
3.624115765 699
5.22596714568 3563354
0.853487732775 4321974
14.59106358 2223933
3.44766689475 5868253
1.09161861938 2800590
8.85764549884 5720532
2.96202633523 761533
[6999251 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [ ]: