notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray
from graphlab import mxnet as mx



In [3]:

    
'''钢炮'''
path = '/home/zongyi/bimbo_data/'



In [13]:

    
train = gl.SFrame.read_csv(path + 'train_lag5_w9_mean.csv', verbose=False)



In [ ]:



In [ ]:

    
# Define the network symbol, equivalent to linear regression
net = mx.symbol.Variable('data')
net = mx.symbol.FullyConnected(data=net, name='fc1', num_hidden=1)
net = mx.symbol.LinearRegressionOutput(data=net, name='lr')

# Load data into SFrame and normalize features
# sf = gl.SFrame.read_csv('https://static.turi.com/datasets/regression/houses.csv')
# features = ['tax', 'bedroom', 'bath', 'size', 'lot']
features =  ['Agencia_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID', 'lag1','lag2','lag3','lag4','lag5','n_a','n_r','n_c','n_p','mpca']
# for f in features:
#     sf[f] = sf[f] - sf[f].mean()
#     sf[f] = sf[f] / sf[f].std()

# Prepare the input iterator from SFrame
# `data_name` must match the first layer's name of the network.
# `label_name` must match the last layer's name plus "_label".
dataiter = mx.io.SFrameIter(train, data_field=features, label_field='Demada_log',
                          data_name='data', label_name='lr_label',
                          batch_size=1)

# Train the network
model = mx.model.FeedForward.create(symbol=net, X=dataiter, num_epoch=20,
                                  learning_rate=1e-2,
                                  eval_metric='rmse')

# Make prediction
model.predict(dataiter)









    



[INFO] graphlab.mxnet.model: Start training with [cpu(0)]



In [4]:

    
# town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
# train = train.join(town, on=['Agencia_ID','Producto_ID'], how='left')
# train = train.fillna('t_c',1)
# train = train.fillna('tcc',0)
# train = train.fillna('tp_sum',0)
# del train['Town']
# del train['t_c']



In [14]:

    
del train['id']
del train['Venta_uni_hoy']
del train['Venta_hoy']
del train['Dev_uni_proxima']
del train['Dev_proxima']
del train['Demanda_uni_equil']



In [15]:

    
del train['prior_sum']
del train['lag_sum']
del train['week_times']
del train['Semana']
del train['Canal_ID']
train = train.fillna('lag1',0)
train = train.fillna('lag2',0)
train = train.fillna('lag3',0)
train = train.fillna('lag4',0)
train = train.fillna('lag5',0)
# train = train.fillna('lag_sum',0)
# train = train.fillna('prior_sum',0)
train = train.fillna('n_a',0)
train = train.fillna('n_r',0)
train = train.fillna('n_c',0)
train = train.fillna('n_p',0)
train









    Out[15]:





    
        Agencia_ID
        Ruta_SAK
        Cliente_ID
        Producto_ID
        Demada_log
        lag1
        lag2
        lag3
        lag4
        lag5
    
    
        2234
        1234
        2308240
        1240
        2.19722
        1.79176
        0.693147
        1.94591
        0.0
        0.0
    
    
        1121
        1418
        169387
        1240
        1.38629
        1.60944
        2.3979
        1.38629
        0.0
        0.0
    
    
        3215
        4435
        1280994
        43285
        1.79176
        2.3979
        1.79176
        2.70805
        2.3979
        2.83321
    
    
        1122
        1456
        80730
        1220
        1.94591
        2.19722
        1.94591
        1.09861
        2.3979
        1.94591
    
    
        2237
        1040
        764465
        2233
        2.30259
        2.48491
        2.07944
        2.3979
        1.09861
        2.30259
    
    
        2271
        3218
        76576
        1700
        3.04452
        2.83321
        2.48491
        2.83321
        3.04452
        3.04452
    
    
        1223
        4453
        168981
        37058
        2.07944
        1.09861
        1.09861
        1.38629
        1.60944
        1.79176
    
    
        1463
        1252
        691747
        34213
        1.09861
        1.09861
        1.09861
        1.09861
        0.693147
        0.693147
    
    
        4049
        1133
        4206501
        43201
        0.693147
        0.693147
        1.09861
        1.09861
        0.693147
        0.693147
    
    
        1616
        1223
        2237452
        2233
        0.693147
        0.693147
        0.693147
        0.693147
        0.0
        0.0
    


    
        n_a
        n_r
        n_c
        n_p
        mpca
    
    
        11387.0
        11333.0
        9.33333
        142714.0
        1.4205
    
    
        42768.0
        6322.33
        14.6667
        142714.0
        1.73313
    
    
        16032.3
        1144.33
        25.6667
        98334.7
        2.37015
    
    
        35329.7
        2842.67
        7.33333
        66750.7
        1.78089
    
    
        37287.3
        6096.33
        5.0
        161730.0
        2.01048
    
    
        1207.33
        698.333
        8.33333
        2861.0
        2.70704
    
    
        33960.0
        3929.67
        29.6667
        67764.3
        1.45274
    
    
        28276.3
        18824.7
        5.0
        18067.3
        0.961918
    
    
        27298.3
        10323.0
        9.66667
        15866.7
        0.905393
    
    
        25933.7
        21958.3
        4.0
        161730.0
        0.819517
    

[10408713 rows x 15 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [5]:

    
# Make a train-test split
train_data, test_data = train.random_split(0.99,seed=788)

# Create a model.
model = gl.boosted_trees_regression.create(train, target='Demada_log',
                                           step_size=0.1,
                                           max_iterations=500,
                                           max_depth = 10,
                                          metric='rmse',
                                          random_seed=78,
                                          column_subsample=0.6,
                                          row_subsample=0.85,
                                          validation_set=test_data,
                                          model_checkpoint_path=path,
                                          model_checkpoint_interval=500,
                                          resume_from_checkpoint=path+'model_checkpoint_1000_w9')

#500       | 14060.836344 | 0.435407      | 0.447519









    




WARNING: Detected extremely low variance for feature(s) 'Semana' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.






    




Resuming from checkpoint at /home/zongyi/bimbo_data/model_checkpoint_1000_w9






    




Boosted trees regression:






    




--------------------------------------------------------






    




Number of examples          : 10408713






    




Number of features          : 24






    




Number of unpacked features : 24






    




Warning: ignoring provided value of column_subsample which is different from the model checkpoint






    




Warning: ignoring provided value of max_iterations which is different from the model checkpoint






    




Warning: ignoring provided value of model_checkpoint_interval which is different from the model checkpoint






    




Warning: ignoring provided value of random_seed which is different from the model checkpoint






    




Resumed training from checkpoint at iteration 1000 which is greater than or equal to max_iterations 1000



In [43]:

    
w = model.get_feature_importance()
w = w.add_row_number()
w









    Out[43]:





    
        id
        name
        index
        count
    
    
        0
        Cliente_ID
        None
        37954
    
    
        1
        Ruta_SAK
        None
        37902
    
    
        2
        n_c
        None
        27015
    
    
        3
        n_r
        None
        26638
    
    
        4
        Producto_ID
        None
        25591
    
    
        5
        tp_sum
        None
        24838
    
    
        6
        Agencia_ID
        None
        24514
    
    
        7
        n_p
        None
        24358
    
    
        8
        prior_sum
        None
        22736
    
    
        9
        lag_sum
        None
        22532
    

[17 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



In [44]:

    
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'}) 
%matplotlib inline

figsize(16, 9)
plt.bar(w['id'], w['count'], tick_label=w['name'])

plt.xticks(rotation=45)









    Out[44]:





(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]),
 <a list of 17 Text xticklabel objects>)



In [20]:

    
# Save predictions to an SArray
predictions = model.predict(train)

# Evaluate the model and save the results into a dictionary
results = model.evaluate(train)
print results









    



{'max_error': 6.300516724586487, 'rmse': 0.4389403189567331}

predict



In [88]:

    
'''Add feature to week 11'''
def feature_w11(test, lag_sum=0, prior_sum=0):
    test_full = test.copy()
    ids = test['id']
    del test['id']
    del test['Semana']
    demand_log = model.predict(test)
    sub1 = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})
    test_full = test_full.join(sub1,on=['id'],how='left')
    lag11 = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag11':agg.MEAN('Demanda_uni_equil')})
    lag11['Semana'] = lag11['Semana'].apply(lambda x: x+1)
    test_full = test_full.join(lag11,on=['Semana','Cliente_ID','Producto_ID'],how='left')
    test_full = test_full.fillna('lag11',0)
    test_full['lag1'] = test_full['lag1'] + test_full['lag11']
    
    if lag_sum == 1:
        test_full['lag_sum'] = test_full['lag_sum'] + test_full['lag11']
        
    if prior_sum == 1:
        lag_sum11 = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag_sum11':agg.SUM('Demanda_uni_equil')})
        lag_sum11['Semana'] = lag_sum11['Semana'].apply(lambda x: x+1)
        test_full = test_full.join(lag_sum11,on=['Semana','Cliente_ID','Producto_ID'],how='left')
        test_full = test_full.fillna('lag_sum11',0)
        test_full['prior_sum'] = test_full['prior_sum'] + test_full['lag_sum11']
        del test_full['lag_sum11']

    del test_full['lag11']
    del test_full['Demanda_uni_equil']
    del test_full['Semana']
    return test_full



In [87]:

    
test = gl.SFrame.read_csv(path + 'test_lag5_w9.csv', verbose=False)
test = test.join(town, on=['Agencia_ID','Producto_ID'], how='left')
test = test.fillna('t_c',1)
test = test.fillna('tcc',0)
test = test.fillna('tp_sum',0)
del test['Town']
del test['t_c']
del test['n_t']



In [89]:

    
# del test['prior_sum']
# del test['lag_sum']
del test['week_times']
# del test['Semana']
del test['Canal_ID']
test = test.fillna('lag1',0)
test = test.fillna('lag2',0)
test = test.fillna('lag3',0)
test = test.fillna('lag4',0)
test = test.fillna('lag5',0)
test = test.fillna('lag_sum',0)
test = test.fillna('prior_sum',0)
test = test.fillna('n_a',0)
test = test.fillna('n_r',0)
test = test.fillna('n_c',0)
test = test.fillna('n_p',0)
print test.head()









    



+---------+--------+------------+----------+------------+-------------+---------+
|    id   | Semana | Agencia_ID | Ruta_SAK | Cliente_ID | Producto_ID |   lag1  |
+---------+--------+------------+----------+------------+-------------+---------+
| 4721633 |   10   |    4037    |   1153   |  2398808   |    43202    |   0.0   |
| 6035419 |   10   |    2239    |   1157   |  1529806   |     1230    | 1.09861 |
| 3462602 |   10   |    1366    |   1051   |  1368806   |     1146    | 1.09861 |
|  855102 |   10   |    1911    |   1156   |   867160   |     1232    | 1.38629 |
| 3242700 |   11   |    1952    |   5721   |  1126647   |    47612    |   0.0   |
| 2421613 |   10   |    2653    |   2117   |   954439   |    35305    | 2.77259 |
| 1573296 |   10   |    1427    |   1102   |  4561415   |    43197    | 1.38629 |
| 1619375 |   10   |    1629    |   1013   |   169508   |     1146    | 2.07944 |
| 3539201 |   11   |    1120    |   1453   |  4602755   |     1230    |   0.0   |
| 1631781 |   10   |    1385    |   6617   |   20130    |     2025    | 2.56495 |
+---------+--------+------------+----------+------------+-------------+---------+
+----------+----------+----------+----------+---------+-----------+---------+
|   lag2   |   lag3   |   lag4   |   lag5   | lag_sum | prior_sum |   n_a   |
+----------+----------+----------+----------+---------+-----------+---------+
|   0.0    | 1.60944  | 1.94591  | 1.38629  | 4.94164 |  4.94164  | 34667.0 |
| 1.38629  |   0.0    |   0.0    |   0.0    | 2.48491 |  3.17805  | 15941.0 |
| 0.693147 | 0.693147 | 1.38629  | 0.693147 | 4.56435 |   5.2575  | 10191.7 |
|   0.0    |   0.0    | 0.693147 | 1.38629  | 3.46574 |  6.68461  | 63002.3 |
| 3.09104  | 2.99573  | 2.07944  |   0.0    | 8.16622 |  8.16622  | 12083.3 |
| 3.04452  | 2.56495  |   0.0    |   0.0    | 8.38206 |  8.38206  | 35352.0 |
| 1.79176  | 1.94591  | 1.94591  | 1.94591  | 9.01578 |  10.9617  | 15958.0 |
| 2.19722  | 1.79176  | 2.07944  | 2.19722  | 10.3451 |  14.4394  | 39959.7 |
| 1.38629  | 1.38629  | 1.09861  | 1.38629  |  5.2575 |  6.35611  | 36688.3 |
|   0.0    | 2.56495  | 2.48491  | 2.07944  | 9.69425 |  14.7441  | 3128.33 |
+----------+----------+----------+----------+---------+-----------+---------+
+---------+---------+----------+------+---------+
|   n_r   |   n_c   |   n_p    | tcc  |  tp_sum |
+---------+---------+----------+------+---------+
| 14727.7 | 18.6667 | 26207.7  | 5437 | 22072.1 |
| 12876.3 |   19.0  | 48687.7  | 2239 | 7288.35 |
| 12811.0 | 7.66667 | 117553.0 | 1275 | 6922.05 |
| 14546.7 |   28.0  | 107057.0 | 8302 | 18695.9 |
| 250.667 |   3.0   |  922.0   | 1598 | 1831.61 |
| 6104.33 | 8.66667 | 119140.0 | 4757 | 9574.76 |
| 26384.0 |   13.0  | 23312.3  | 2526 | 10830.0 |
| 15596.7 | 31.6667 | 117553.0 | 5960 | 15834.2 |
|  2989.0 |   27.0  | 48687.7  | 5094 | 7680.14 |
|  2996.0 | 6.66667 | 6303.67  | 4087 | 2287.19 |
+---------+---------+----------+------+---------+
[10 rows x 19 columns]



In [ ]:

    
test_full = feature_w11(test, lag_sum=1, prior_sum=1)



In [64]:

    
test_full = test.copy()
ids = test['id']
del test['id']
demand_log = model.predict(test)
sub1 = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})
test_full = test_full.join(sub1,on=['id'],how='left')
lag = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag':agg.MEAN('Demanda_uni_equil')})
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test_full = test_full.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
test_full = test_full.fillna('lag',0)
test_full['lag1'] = test_full['lag1'] + test_full['lag']
del test_full['lag']
del test_full['Demanda_uni_equil']



In [65]:



In [66]:

    
ids = test_full['id']
del test_full['id']
del test_full['Semana']
demand_log = model.predict(test_full)
sub = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})



In [69]:

    
import math
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(lambda x: math.expm1(max(0, x)))



In [80]:

    
file_name = 'w9'+'_f'+str(model.num_features)+'_n'+str(model.max_iterations)+'_c'+str(model.column_subsample)
sub.save(path+file_name,format='csv')



In [70]:

    
sub









    Out[70]:





    
        Demanda_uni_equil
        id
    
    
        3.2877291637
        4721633
    
    
        2.24223753928
        6035419
    
    
        1.68033878328
        3462602
    
    
        3.8207045231
        855102
    
    
        8.93359333692
        3242700
    
    
        11.8469821368
        2421613
    
    
        3.53498757097
        1573296
    
    
        8.40957090009
        1619375
    
    
        1.75464872086
        3539201
    
    
        11.0039233874
        1631781
    

[6999251 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Agencia_ID	Ruta_SAK	Cliente_ID	Producto_ID	Demada_log	lag1	lag2	lag3	lag4	lag5
2234	1234	2308240	1240	2.19722	1.79176	0.693147	1.94591	0.0	0.0
1121	1418	169387	1240	1.38629	1.60944	2.3979	1.38629	0.0	0.0
3215	4435	1280994	43285	1.79176	2.3979	1.79176	2.70805	2.3979	2.83321
1122	1456	80730	1220	1.94591	2.19722	1.94591	1.09861	2.3979	1.94591
2237	1040	764465	2233	2.30259	2.48491	2.07944	2.3979	1.09861	2.30259
2271	3218	76576	1700	3.04452	2.83321	2.48491	2.83321	3.04452	3.04452
1223	4453	168981	37058	2.07944	1.09861	1.09861	1.38629	1.60944	1.79176
1463	1252	691747	34213	1.09861	1.09861	1.09861	1.09861	0.693147	0.693147
4049	1133	4206501	43201	0.693147	0.693147	1.09861	1.09861	0.693147	0.693147
1616	1223	2237452	2233	0.693147	0.693147	0.693147	0.693147	0.0	0.0

n_a	n_r	n_c	n_p	mpca
11387.0	11333.0	9.33333	142714.0	1.4205
42768.0	6322.33	14.6667	142714.0	1.73313
16032.3	1144.33	25.6667	98334.7	2.37015
35329.7	2842.67	7.33333	66750.7	1.78089
37287.3	6096.33	5.0	161730.0	2.01048
1207.33	698.333	8.33333	2861.0	2.70704
33960.0	3929.67	29.6667	67764.3	1.45274
28276.3	18824.7	5.0	18067.3	0.961918
27298.3	10323.0	9.66667	15866.7	0.905393
25933.7	21958.3	4.0	161730.0	0.819517

id	name	index	count
0	Cliente_ID	None	37954
1	Ruta_SAK	None	37902
2	n_c	None	27015
3	n_r	None	26638
4	Producto_ID	None	25591
5	tp_sum	None	24838
6	Agencia_ID	None	24514
7	n_p	None	24358
8	prior_sum	None	22736
9	lag_sum	None	22532

Demanda_uni_equil	id
3.2877291637	4721633
2.24223753928	6035419
1.68033878328	3462602
3.8207045231	855102
8.93359333692	3242700
11.8469821368	2421613
3.53498757097	1573296
8.40957090009	1619375
1.75464872086	3539201
11.0039233874	1631781