In [2]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray
from graphlab import mxnet as mx

In [3]:
'''钢炮'''
path = '/home/zongyi/bimbo_data/'

In [13]:
train = gl.SFrame.read_csv(path + 'train_lag5_w9_mean.csv', verbose=False)

In [ ]:


In [ ]:
# Define the network symbol, equivalent to linear regression
net = mx.symbol.Variable('data')
net = mx.symbol.FullyConnected(data=net, name='fc1', num_hidden=1)
net = mx.symbol.LinearRegressionOutput(data=net, name='lr')

# Load data into SFrame and normalize features
# sf = gl.SFrame.read_csv('https://static.turi.com/datasets/regression/houses.csv')
# features = ['tax', 'bedroom', 'bath', 'size', 'lot']
features =  ['Agencia_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID', 'lag1','lag2','lag3','lag4','lag5','n_a','n_r','n_c','n_p','mpca']
# for f in features:
#     sf[f] = sf[f] - sf[f].mean()
#     sf[f] = sf[f] / sf[f].std()

# Prepare the input iterator from SFrame
# `data_name` must match the first layer's name of the network.
# `label_name` must match the last layer's name plus "_label".
dataiter = mx.io.SFrameIter(train, data_field=features, label_field='Demada_log',
                          data_name='data', label_name='lr_label',
                          batch_size=1)

# Train the network
model = mx.model.FeedForward.create(symbol=net, X=dataiter, num_epoch=20,
                                  learning_rate=1e-2,
                                  eval_metric='rmse')

# Make prediction
model.predict(dataiter)


[INFO] graphlab.mxnet.model: Start training with [cpu(0)]

In [4]:
# town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
# train = train.join(town, on=['Agencia_ID','Producto_ID'], how='left')
# train = train.fillna('t_c',1)
# train = train.fillna('tcc',0)
# train = train.fillna('tp_sum',0)
# del train['Town']
# del train['t_c']

In [14]:
del train['id']
del train['Venta_uni_hoy']
del train['Venta_hoy']
del train['Dev_uni_proxima']
del train['Dev_proxima']
del train['Demanda_uni_equil']

In [15]:
del train['prior_sum']
del train['lag_sum']
del train['week_times']
del train['Semana']
del train['Canal_ID']
train = train.fillna('lag1',0)
train = train.fillna('lag2',0)
train = train.fillna('lag3',0)
train = train.fillna('lag4',0)
train = train.fillna('lag5',0)
# train = train.fillna('lag_sum',0)
# train = train.fillna('prior_sum',0)
train = train.fillna('n_a',0)
train = train.fillna('n_r',0)
train = train.fillna('n_c',0)
train = train.fillna('n_p',0)
train


Out[15]:
Agencia_ID Ruta_SAK Cliente_ID Producto_ID Demada_log lag1 lag2 lag3 lag4 lag5
2234 1234 2308240 1240 2.19722 1.79176 0.693147 1.94591 0.0 0.0
1121 1418 169387 1240 1.38629 1.60944 2.3979 1.38629 0.0 0.0
3215 4435 1280994 43285 1.79176 2.3979 1.79176 2.70805 2.3979 2.83321
1122 1456 80730 1220 1.94591 2.19722 1.94591 1.09861 2.3979 1.94591
2237 1040 764465 2233 2.30259 2.48491 2.07944 2.3979 1.09861 2.30259
2271 3218 76576 1700 3.04452 2.83321 2.48491 2.83321 3.04452 3.04452
1223 4453 168981 37058 2.07944 1.09861 1.09861 1.38629 1.60944 1.79176
1463 1252 691747 34213 1.09861 1.09861 1.09861 1.09861 0.693147 0.693147
4049 1133 4206501 43201 0.693147 0.693147 1.09861 1.09861 0.693147 0.693147
1616 1223 2237452 2233 0.693147 0.693147 0.693147 0.693147 0.0 0.0
n_a n_r n_c n_p mpca
11387.0 11333.0 9.33333 142714.0 1.4205
42768.0 6322.33 14.6667 142714.0 1.73313
16032.3 1144.33 25.6667 98334.7 2.37015
35329.7 2842.67 7.33333 66750.7 1.78089
37287.3 6096.33 5.0 161730.0 2.01048
1207.33 698.333 8.33333 2861.0 2.70704
33960.0 3929.67 29.6667 67764.3 1.45274
28276.3 18824.7 5.0 18067.3 0.961918
27298.3 10323.0 9.66667 15866.7 0.905393
25933.7 21958.3 4.0 161730.0 0.819517
[10408713 rows x 15 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [5]:
# Make a train-test split
train_data, test_data = train.random_split(0.99,seed=788)

# Create a model.
model = gl.boosted_trees_regression.create(train, target='Demada_log',
                                           step_size=0.1,
                                           max_iterations=500,
                                           max_depth = 10,
                                          metric='rmse',
                                          random_seed=78,
                                          column_subsample=0.6,
                                          row_subsample=0.85,
                                          validation_set=test_data,
                                          model_checkpoint_path=path,
                                          model_checkpoint_interval=500,
                                          resume_from_checkpoint=path+'model_checkpoint_1000_w9')

#500       | 14060.836344 | 0.435407      | 0.447519


WARNING: Detected extremely low variance for feature(s) 'Semana' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Resuming from checkpoint at /home/zongyi/bimbo_data/model_checkpoint_1000_w9
Boosted trees regression:
--------------------------------------------------------
Number of examples          : 10408713
Number of features          : 24
Number of unpacked features : 24
Warning: ignoring provided value of column_subsample which is different from the model checkpoint
Warning: ignoring provided value of max_iterations which is different from the model checkpoint
Warning: ignoring provided value of model_checkpoint_interval which is different from the model checkpoint
Warning: ignoring provided value of random_seed which is different from the model checkpoint
Resumed training from checkpoint at iteration 1000 which is greater than or equal to max_iterations 1000

In [43]:
w = model.get_feature_importance()
w = w.add_row_number()
w


Out[43]:
id name index count
0 Cliente_ID None 37954
1 Ruta_SAK None 37902
2 n_c None 27015
3 n_r None 26638
4 Producto_ID None 25591
5 tp_sum None 24838
6 Agencia_ID None 24514
7 n_p None 24358
8 prior_sum None 22736
9 lag_sum None 22532
[17 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [44]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'}) 
%matplotlib inline

figsize(16, 9)
plt.bar(w['id'], w['count'], tick_label=w['name'])

plt.xticks(rotation=45)


Out[44]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]),
 <a list of 17 Text xticklabel objects>)

In [20]:
# Save predictions to an SArray
predictions = model.predict(train)

# Evaluate the model and save the results into a dictionary
results = model.evaluate(train)
print results


{'max_error': 6.300516724586487, 'rmse': 0.4389403189567331}

predict


In [88]:
'''Add feature to week 11'''
def feature_w11(test, lag_sum=0, prior_sum=0):
    test_full = test.copy()
    ids = test['id']
    del test['id']
    del test['Semana']
    demand_log = model.predict(test)
    sub1 = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})
    test_full = test_full.join(sub1,on=['id'],how='left')
    lag11 = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag11':agg.MEAN('Demanda_uni_equil')})
    lag11['Semana'] = lag11['Semana'].apply(lambda x: x+1)
    test_full = test_full.join(lag11,on=['Semana','Cliente_ID','Producto_ID'],how='left')
    test_full = test_full.fillna('lag11',0)
    test_full['lag1'] = test_full['lag1'] + test_full['lag11']
    
    if lag_sum == 1:
        test_full['lag_sum'] = test_full['lag_sum'] + test_full['lag11']
        
    if prior_sum == 1:
        lag_sum11 = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag_sum11':agg.SUM('Demanda_uni_equil')})
        lag_sum11['Semana'] = lag_sum11['Semana'].apply(lambda x: x+1)
        test_full = test_full.join(lag_sum11,on=['Semana','Cliente_ID','Producto_ID'],how='left')
        test_full = test_full.fillna('lag_sum11',0)
        test_full['prior_sum'] = test_full['prior_sum'] + test_full['lag_sum11']
        del test_full['lag_sum11']

    del test_full['lag11']
    del test_full['Demanda_uni_equil']
    del test_full['Semana']
    return test_full

In [87]:
test = gl.SFrame.read_csv(path + 'test_lag5_w9.csv', verbose=False)
test = test.join(town, on=['Agencia_ID','Producto_ID'], how='left')
test = test.fillna('t_c',1)
test = test.fillna('tcc',0)
test = test.fillna('tp_sum',0)
del test['Town']
del test['t_c']
del test['n_t']

In [89]:
# del test['prior_sum']
# del test['lag_sum']
del test['week_times']
# del test['Semana']
del test['Canal_ID']
test = test.fillna('lag1',0)
test = test.fillna('lag2',0)
test = test.fillna('lag3',0)
test = test.fillna('lag4',0)
test = test.fillna('lag5',0)
test = test.fillna('lag_sum',0)
test = test.fillna('prior_sum',0)
test = test.fillna('n_a',0)
test = test.fillna('n_r',0)
test = test.fillna('n_c',0)
test = test.fillna('n_p',0)
print test.head()


+---------+--------+------------+----------+------------+-------------+---------+
|    id   | Semana | Agencia_ID | Ruta_SAK | Cliente_ID | Producto_ID |   lag1  |
+---------+--------+------------+----------+------------+-------------+---------+
| 4721633 |   10   |    4037    |   1153   |  2398808   |    43202    |   0.0   |
| 6035419 |   10   |    2239    |   1157   |  1529806   |     1230    | 1.09861 |
| 3462602 |   10   |    1366    |   1051   |  1368806   |     1146    | 1.09861 |
|  855102 |   10   |    1911    |   1156   |   867160   |     1232    | 1.38629 |
| 3242700 |   11   |    1952    |   5721   |  1126647   |    47612    |   0.0   |
| 2421613 |   10   |    2653    |   2117   |   954439   |    35305    | 2.77259 |
| 1573296 |   10   |    1427    |   1102   |  4561415   |    43197    | 1.38629 |
| 1619375 |   10   |    1629    |   1013   |   169508   |     1146    | 2.07944 |
| 3539201 |   11   |    1120    |   1453   |  4602755   |     1230    |   0.0   |
| 1631781 |   10   |    1385    |   6617   |   20130    |     2025    | 2.56495 |
+---------+--------+------------+----------+------------+-------------+---------+
+----------+----------+----------+----------+---------+-----------+---------+
|   lag2   |   lag3   |   lag4   |   lag5   | lag_sum | prior_sum |   n_a   |
+----------+----------+----------+----------+---------+-----------+---------+
|   0.0    | 1.60944  | 1.94591  | 1.38629  | 4.94164 |  4.94164  | 34667.0 |
| 1.38629  |   0.0    |   0.0    |   0.0    | 2.48491 |  3.17805  | 15941.0 |
| 0.693147 | 0.693147 | 1.38629  | 0.693147 | 4.56435 |   5.2575  | 10191.7 |
|   0.0    |   0.0    | 0.693147 | 1.38629  | 3.46574 |  6.68461  | 63002.3 |
| 3.09104  | 2.99573  | 2.07944  |   0.0    | 8.16622 |  8.16622  | 12083.3 |
| 3.04452  | 2.56495  |   0.0    |   0.0    | 8.38206 |  8.38206  | 35352.0 |
| 1.79176  | 1.94591  | 1.94591  | 1.94591  | 9.01578 |  10.9617  | 15958.0 |
| 2.19722  | 1.79176  | 2.07944  | 2.19722  | 10.3451 |  14.4394  | 39959.7 |
| 1.38629  | 1.38629  | 1.09861  | 1.38629  |  5.2575 |  6.35611  | 36688.3 |
|   0.0    | 2.56495  | 2.48491  | 2.07944  | 9.69425 |  14.7441  | 3128.33 |
+----------+----------+----------+----------+---------+-----------+---------+
+---------+---------+----------+------+---------+
|   n_r   |   n_c   |   n_p    | tcc  |  tp_sum |
+---------+---------+----------+------+---------+
| 14727.7 | 18.6667 | 26207.7  | 5437 | 22072.1 |
| 12876.3 |   19.0  | 48687.7  | 2239 | 7288.35 |
| 12811.0 | 7.66667 | 117553.0 | 1275 | 6922.05 |
| 14546.7 |   28.0  | 107057.0 | 8302 | 18695.9 |
| 250.667 |   3.0   |  922.0   | 1598 | 1831.61 |
| 6104.33 | 8.66667 | 119140.0 | 4757 | 9574.76 |
| 26384.0 |   13.0  | 23312.3  | 2526 | 10830.0 |
| 15596.7 | 31.6667 | 117553.0 | 5960 | 15834.2 |
|  2989.0 |   27.0  | 48687.7  | 5094 | 7680.14 |
|  2996.0 | 6.66667 | 6303.67  | 4087 | 2287.19 |
+---------+---------+----------+------+---------+
[10 rows x 19 columns]


In [ ]:
test_full = feature_w11(test, lag_sum=1, prior_sum=1)

In [64]:
test_full = test.copy()
ids = test['id']
del test['id']
demand_log = model.predict(test)
sub1 = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})
test_full = test_full.join(sub1,on=['id'],how='left')
lag = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag':agg.MEAN('Demanda_uni_equil')})
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test_full = test_full.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
test_full = test_full.fillna('lag',0)
test_full['lag1'] = test_full['lag1'] + test_full['lag']
del test_full['lag']
del test_full['Demanda_uni_equil']

In [65]:


In [66]:
ids = test_full['id']
del test_full['id']
del test_full['Semana']
demand_log = model.predict(test_full)
sub = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})

In [69]:
import math
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(lambda x: math.expm1(max(0, x)))

In [80]:
file_name = 'w9'+'_f'+str(model.num_features)+'_n'+str(model.max_iterations)+'_c'+str(model.column_subsample)
sub.save(path+file_name,format='csv')

In [70]:
sub


Out[70]:
Demanda_uni_equil id
3.2877291637 4721633
2.24223753928 6035419
1.68033878328 3462602
3.8207045231 855102
8.93359333692 3242700
11.8469821368 2421613
3.53498757097 1573296
8.40957090009 1619375
1.75464872086 3539201
11.0039233874 1631781
[6999251 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.