In [2]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray

In [3]:
'''Steel cannon (machine nickname)'''
path = '/home/zongyi/bimbo_data/'

In [4]:
train = gl.SFrame.read_csv(path + 'train_lag5_w9.csv', verbose=False)


This non-commercial license of GraphLab Create for academic use is assigned to zong-yi.liu@irit.fr and will expire on July 13, 2017.
[INFO] graphlab.cython.cy_server: GraphLab Create v2.0.1 started. Logging: /tmp/graphlab_server_1472143272.log

In [4]:
town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
train = train.join(town, on=['Agencia_ID','Producto_ID'], how='left')
train = train.fillna('t_c',1)
train = train.fillna('tcc',0)
train = train.fillna('tp_sum',0)
del train['Town']
del train['t_c']

In [5]:
del train['n_t']

In [6]:
del train['id']
del train['Venta_uni_hoy']
del train['Venta_hoy']
del train['Dev_uni_proxima']
del train['Dev_proxima']
del train['Demanda_uni_equil']
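
The 'Demada_log' target column already comes log-transformed in train_lag5_w9.csv; it is not created in this notebook. A sketch of the assumed upstream transform (consistent with the expm1 inversion applied to the predictions at the end), using the original Kaggle column as a hypothetical input:

In [ ]:
# Sketch (not executed here): the target is assumed to be log1p of the adjusted demand,
# since the final predictions are inverted with expm1.
# raw = gl.SFrame.read_csv(path + 'train.csv', verbose=False)
# raw['Demada_log'] = raw['Demanda_uni_equil'].apply(lambda x: math.log1p(x))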

In [6]:
# relag_train = gl.SFrame.read_csv(path + 're_lag_train.csv', verbose=False)
# train = train.join(relag_train, on=['Cliente_ID','Producto_ID','Semana'], how='left')
# train = train.fillna('re_lag1',0)
# train = train.fillna('re_lag2',0)
# train = train.fillna('re_lag3',0)
# train = train.fillna('re_lag4',0)
# train = train.fillna('re_lag5',0)
# del relag_train

In [7]:
# pd = gl.SFrame.read_csv(path + 'products.csv', verbose=False)
# train = train.join(pd, on=['Producto_ID'], how='left')
# train = train.fillna('prom',0)
# train = train.fillna('weight',0)
# train = train.fillna('pieces',1)
# train = train.fillna('w_per_piece',0)
# train = train.fillna('healthy',0)
# train = train.fillna('drink',0)
# del train['brand']
# del train['NombreProducto']
# del pd

In [8]:
# client = gl.SFrame.read_csv(path + 'clients.csv', verbose=False)
# train = train.join(client, on=['Cliente_ID'], how='left')
# del client

In [9]:
# cluster = gl.SFrame.read_csv(path + 'prod_cluster.csv', verbose=False)
# cluster = cluster[['Producto_ID','cluster']]
# train = train.join(cluster, on=['Producto_ID'], how='left')

In [36]:
# del train['week_times']
# del train['Semana']
# del train['Canal_ID']
train = train.fillna('lag1',0)
train = train.fillna('lag2',0)
train = train.fillna('lag3',0)
train = train.fillna('lag4',0)
train = train.fillna('lag5',0)
train = train.fillna('lag_sum',0)
train = train.fillna('prior_sum',0)
train = train.fillna('n_a',0)
train = train.fillna('n_r',0)
train = train.fillna('n_c',0)
train = train.fillna('n_p',0)
train


Out[36]:
+------------+----------+------------+-------------+------------+---------+----------+---------+---------+---------+
| Agencia_ID | Ruta_SAK | Cliente_ID | Producto_ID | Demada_log | lag1    | lag2     | lag3    | lag4    | lag5    |
+------------+----------+------------+-------------+------------+---------+----------+---------+---------+---------+
| 2234       | 1234     | 2308240    | 1240        | 2.19722    | 1.79176 | 0.693147 | 1.94591 | 0.0     | 0.0     |
| 1121       | 1415     | 1758210    | 35651       | 2.3979     | 1.94591 | 2.56495  | 3.4012  | 2.48491 | 2.56495 |
| 2032       | 1267     | 3554707    | 1240        | 1.38629    | 1.60944 | 1.60944  | 1.38629 | 1.79176 | 1.94591 |
| 2220       | 1227     | 2492643    | 1109        | 1.09861    | 1.09861 | 0.693147 | 1.09861 | 1.09861 | 0.0     |
| 1612       | 1139     | 1965168    | 972         | 1.60944    | 1.09861 | 0.693147 | 1.38629 | 1.09861 | 1.79176 |
| 2230       | 1002     | 2482931    | 3631        | 2.48491    | 2.3979  | 2.56495  | 2.56495 | 2.30259 | 1.09861 |
| 1121       | 1049     | 2449109    | 1125        | 2.19722    | 3.29584 | 3.04452  | 1.94591 | 3.17805 | 2.77259 |
| 1121       | 1418     | 169387     | 1240        | 1.38629    | 1.60944 | 2.3979   | 1.38629 | 0.0     | 0.0     |
| 4041       | 1256     | 205279     | 1240        | 1.09861    | 1.09861 | 1.09861  | 1.09861 | 1.38629 | 1.09861 |
| 3215       | 4435     | 1280994    | 43285       | 1.79176    | 2.3979  | 1.79176  | 2.70805 | 2.3979  | 2.83321 |
+------------+----------+------------+-------------+------------+---------+----------+---------+---------+---------+
+---------+-----------+---------+---------+---------+----------+------+---------+
| lag_sum | prior_sum | n_a     | n_r     | n_c     | n_p      | tcc  | tp_sum  |
+---------+-----------+---------+---------+---------+----------+------+---------+
| 4.43082 | 4.43082   | 11387.0 | 11333.0 | 9.33333 | 142714.0 | 1474 | 7032.04 |
| 12.9619 | 12.9619   | 42768.0 | 5594.0  | 16.6667 | 90854.7  | 5404 | 27611.6 |
| 8.34284 | 10.1346   | 32083.0 | 10454.0 | 7.33333 | 142714.0 | 4551 | 28371.8 |
| 3.98898 | 3.98898   | 13659.3 | 16880.3 | 5.66667 | 111838.0 | 1894 | 7265.43 |
| 6.06842 | 7.16704   | 16434.0 | 6750.67 | 14.6667 | 43849.3  | 2315 | 3132.52 |
| 10.929  | 13.3269   | 24739.7 | 21504.0 | 32.3333 | 66206.0  | 3509 | 1317.56 |
| 14.2369 | 17.495    | 42768.0 | 2840.67 | 20.3333 | 105378.0 | 5404 | 19233.5 |
| 5.39363 | 5.39363   | 42768.0 | 6322.33 | 14.6667 | 142714.0 | 5404 | 23690.2 |
| 5.78073 | 7.16704   | 22477.7 | 16456.7 | 8.66667 | 142714.0 | 3356 | 19100.2 |
| 12.1288 | 12.1288   | 16032.3 | 1144.33 | 25.6667 | 98334.7  | 7784 | 36823.3 |
+---------+-----------+---------+---------+---------+----------+------+---------+
[10408713 rows x 18 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
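
The chain of fillna calls above could be written more compactly as a loop over the lag and count columns; a sketch with identical behaviour:

In [ ]:
# Equivalent, more compact form of the fillna chain above (sketch):
# for col in ['lag1','lag2','lag3','lag4','lag5','lag_sum','prior_sum','n_a','n_r','n_c','n_p']:
#     train = train.fillna(col, 0)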

In [5]:
# Make a 99/1 train-validation split; test_data is used only as the validation set below.
# Note that the full `train` SFrame (not train_data) is passed to create(), so the
# validation rows also appear in the training data.
train_data, test_data = train.random_split(0.99, seed=788)

# Create a model.
model = gl.boosted_trees_regression.create(train, target='Demada_log',
                                           step_size=0.1,
                                           max_iterations=500,
                                           max_depth = 10,
                                          metric='rmse',
                                          random_seed=78,
                                          column_subsample=0.6,
                                          row_subsample=0.85,
                                          validation_set=test_data,
                                          model_checkpoint_path=path,
                                          model_checkpoint_interval=500,
                                          resume_from_checkpoint=path+'model_checkpoint_1000_w9')

# Earlier training log line (iteration | elapsed time (s) | training rmse | validation rmse):
#500       | 14060.836344 | 0.435407      | 0.447519
# Since this run resumes from a 1000-iteration checkpoint, the max_iterations=500 given above
# is ignored and no further boosting iterations are performed (see the warnings below).


WARNING: Detected extremely low variance for feature(s) 'Semana' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Resuming from checkpoint at /home/zongyi/bimbo_data/model_checkpoint_1000_w9
Boosted trees regression:
--------------------------------------------------------
Number of examples          : 10408713
Number of features          : 24
Number of unpacked features : 24
Warning: ignoring provided value of column_subsample which is different from the model checkpoint
Warning: ignoring provided value of max_iterations which is different from the model checkpoint
Warning: ignoring provided value of model_checkpoint_interval which is different from the model checkpoint
Warning: ignoring provided value of random_seed which is different from the model checkpoint
Resumed training from checkpoint at iteration 1000 which is greater than or equal to max_iterations 1000

In [7]:
model.features


Out[7]:
['Agencia_ID',
 'Canal_ID',
 'Ruta_SAK',
 'Cliente_ID',
 'Producto_ID',
 'lag1',
 'lag2',
 'lag3',
 'lag4',
 'lag5',
 'n_a',
 'n_r',
 'n_c',
 'n_p']

In [43]:
w = model.get_feature_importance()
w = w.add_row_number()
w


Out[43]:
+----+-------------+-------+-------+
| id | name        | index | count |
+----+-------------+-------+-------+
| 0  | Cliente_ID  | None  | 37954 |
| 1  | Ruta_SAK    | None  | 37902 |
| 2  | n_c         | None  | 27015 |
| 3  | n_r         | None  | 26638 |
| 4  | Producto_ID | None  | 25591 |
| 5  | tp_sum      | None  | 24838 |
| 6  | Agencia_ID  | None  | 24514 |
| 7  | n_p         | None  | 24358 |
| 8  | prior_sum   | None  | 22736 |
| 9  | lag_sum     | None  | 22532 |
+----+-------------+-------+-------+
[17 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
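
To see the full importance table rather than just the head (as the note above suggests):

In [ ]:
# w.print_rows(num_rows=17)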

In [44]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'}) 
%matplotlib inline

figsize(16, 9)
plt.bar(w['id'], w['count'], tick_label=w['name'])

plt.xticks(rotation=45)


Out[44]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]),
 <a list of 17 Text xticklabel objects>)

In [20]:
# Save predictions to an SArray
predictions = model.predict(train)

# Evaluate the model on the training data (in-sample fit) and save the results into a dictionary
results = model.evaluate(train)
print results


{'max_error': 6.300516724586487, 'rmse': 0.4389403189567331}
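
Because the target is log1p-transformed demand, RMSE on 'Demada_log' is the competition's RMSLE on the original unit scale. A sketch of computing it by hand on a held-out SFrame `val` (hypothetical name) with the same columns:

In [ ]:
# Sketch: RMSE in log1p space == RMSLE in unit space.
# preds = model.predict(val)
# rmsle = math.sqrt(((preds - val['Demada_log']) ** 2).mean())
# print rmsle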


In [88]:
'''Add feature to week 11'''
def feature_w11(test, lag_sum=0, prior_sum=0):
    # Predict week 10 first, then feed those predictions back in as the lag-1 feature
    # for the week-11 rows (week 11 has no observed lag from week 10).
    # Note: the del statements below also drop 'id' and 'Semana' from the caller's `test`.
    test_full = test.copy()
    ids = test['id']
    del test['id']
    del test['Semana']
    demand_log = model.predict(test)
    sub1 = gl.SFrame({'id': ids, 'Demanda_uni_equil': demand_log})
    test_full = test_full.join(sub1, on=['id'], how='left')
    # Mean predicted log-demand per (Semana, Cliente_ID, Producto_ID), shifted forward
    # one week so that the week-10 means join onto the week-11 rows as their lag-1 value.
    lag11 = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag11': agg.MEAN('Demanda_uni_equil')})
    lag11['Semana'] = lag11['Semana'].apply(lambda x: x + 1)
    test_full = test_full.join(lag11, on=['Semana','Cliente_ID','Producto_ID'], how='left')
    test_full = test_full.fillna('lag11', 0)
    # Week-10 rows get lag11 == 0 after fillna, so only week-11 rows are affected.
    test_full['lag1'] = test_full['lag1'] + test_full['lag11']

    if lag_sum == 1:
        # Also fold the predicted week-10 demand into the rolling lag_sum feature.
        test_full['lag_sum'] = test_full['lag_sum'] + test_full['lag11']

    if prior_sum == 1:
        # Same idea for prior_sum, but using the summed (not mean) predicted demand.
        lag_sum11 = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag_sum11': agg.SUM('Demanda_uni_equil')})
        lag_sum11['Semana'] = lag_sum11['Semana'].apply(lambda x: x + 1)
        test_full = test_full.join(lag_sum11, on=['Semana','Cliente_ID','Producto_ID'], how='left')
        test_full = test_full.fillna('lag_sum11', 0)
        test_full['prior_sum'] = test_full['prior_sum'] + test_full['lag_sum11']
        del test_full['lag_sum11']

    del test_full['lag11']
    del test_full['Demanda_uni_equil']
    del test_full['Semana']
    return test_full

In [87]:
test = gl.SFrame.read_csv(path + 'test_lag5_w9.csv', verbose=False)
test = test.join(town, on=['Agencia_ID','Producto_ID'], how='left')
test = test.fillna('t_c',1)
test = test.fillna('tcc',0)
test = test.fillna('tp_sum',0)
del test['Town']
del test['t_c']
del test['n_t']

In [89]:
# del test['prior_sum']
# del test['lag_sum']
del test['week_times']
# del test['Semana']
del test['Canal_ID']
test = test.fillna('lag1',0)
test = test.fillna('lag2',0)
test = test.fillna('lag3',0)
test = test.fillna('lag4',0)
test = test.fillna('lag5',0)
test = test.fillna('lag_sum',0)
test = test.fillna('prior_sum',0)
test = test.fillna('n_a',0)
test = test.fillna('n_r',0)
test = test.fillna('n_c',0)
test = test.fillna('n_p',0)
print test.head()


+---------+--------+------------+----------+------------+-------------+---------+
|    id   | Semana | Agencia_ID | Ruta_SAK | Cliente_ID | Producto_ID |   lag1  |
+---------+--------+------------+----------+------------+-------------+---------+
| 4721633 |   10   |    4037    |   1153   |  2398808   |    43202    |   0.0   |
| 6035419 |   10   |    2239    |   1157   |  1529806   |     1230    | 1.09861 |
| 3462602 |   10   |    1366    |   1051   |  1368806   |     1146    | 1.09861 |
|  855102 |   10   |    1911    |   1156   |   867160   |     1232    | 1.38629 |
| 3242700 |   11   |    1952    |   5721   |  1126647   |    47612    |   0.0   |
| 2421613 |   10   |    2653    |   2117   |   954439   |    35305    | 2.77259 |
| 1573296 |   10   |    1427    |   1102   |  4561415   |    43197    | 1.38629 |
| 1619375 |   10   |    1629    |   1013   |   169508   |     1146    | 2.07944 |
| 3539201 |   11   |    1120    |   1453   |  4602755   |     1230    |   0.0   |
| 1631781 |   10   |    1385    |   6617   |   20130    |     2025    | 2.56495 |
+---------+--------+------------+----------+------------+-------------+---------+
+----------+----------+----------+----------+---------+-----------+---------+
|   lag2   |   lag3   |   lag4   |   lag5   | lag_sum | prior_sum |   n_a   |
+----------+----------+----------+----------+---------+-----------+---------+
|   0.0    | 1.60944  | 1.94591  | 1.38629  | 4.94164 |  4.94164  | 34667.0 |
| 1.38629  |   0.0    |   0.0    |   0.0    | 2.48491 |  3.17805  | 15941.0 |
| 0.693147 | 0.693147 | 1.38629  | 0.693147 | 4.56435 |   5.2575  | 10191.7 |
|   0.0    |   0.0    | 0.693147 | 1.38629  | 3.46574 |  6.68461  | 63002.3 |
| 3.09104  | 2.99573  | 2.07944  |   0.0    | 8.16622 |  8.16622  | 12083.3 |
| 3.04452  | 2.56495  |   0.0    |   0.0    | 8.38206 |  8.38206  | 35352.0 |
| 1.79176  | 1.94591  | 1.94591  | 1.94591  | 9.01578 |  10.9617  | 15958.0 |
| 2.19722  | 1.79176  | 2.07944  | 2.19722  | 10.3451 |  14.4394  | 39959.7 |
| 1.38629  | 1.38629  | 1.09861  | 1.38629  |  5.2575 |  6.35611  | 36688.3 |
|   0.0    | 2.56495  | 2.48491  | 2.07944  | 9.69425 |  14.7441  | 3128.33 |
+----------+----------+----------+----------+---------+-----------+---------+
+---------+---------+----------+------+---------+
|   n_r   |   n_c   |   n_p    | tcc  |  tp_sum |
+---------+---------+----------+------+---------+
| 14727.7 | 18.6667 | 26207.7  | 5437 | 22072.1 |
| 12876.3 |   19.0  | 48687.7  | 2239 | 7288.35 |
| 12811.0 | 7.66667 | 117553.0 | 1275 | 6922.05 |
| 14546.7 |   28.0  | 107057.0 | 8302 | 18695.9 |
| 250.667 |   3.0   |  922.0   | 1598 | 1831.61 |
| 6104.33 | 8.66667 | 119140.0 | 4757 | 9574.76 |
| 26384.0 |   13.0  | 23312.3  | 2526 | 10830.0 |
| 15596.7 | 31.6667 | 117553.0 | 5960 | 15834.2 |
|  2989.0 |   27.0  | 48687.7  | 5094 | 7680.14 |
|  2996.0 | 6.66667 | 6303.67  | 4087 | 2287.19 |
+---------+---------+----------+------+---------+
[10 rows x 19 columns]
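
feature_w11 only makes sense if the test set covers weeks 10 and 11, since the week-10 predictions are recycled as the week-11 lag; a quick sanity check, sketched:

In [ ]:
# Sanity check (sketch): the recursive featurization assumes only weeks 10 and 11 are present.
# print test['Semana'].unique()   # expected: [10, 11]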


In [ ]:
test_full = feature_w11(test, lag_sum=1, prior_sum=1)

In [64]:
# Inline version of feature_w11 (without the lag_sum / prior_sum updates):
# predict week 10, then use the mean predicted log-demand per (Cliente_ID, Producto_ID)
# as the week-11 lag-1 feature.
test_full = test.copy()
ids = test['id']
del test['id']
demand_log = model.predict(test)
sub1 = gl.SFrame({'id': ids, 'Demanda_uni_equil': demand_log})
test_full = test_full.join(sub1, on=['id'], how='left')
lag = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag': agg.MEAN('Demanda_uni_equil')})
lag['Semana'] = lag['Semana'].apply(lambda x: x + 1)
test_full = test_full.join(lag, on=['Semana','Cliente_ID','Producto_ID'], how='left')
test_full = test_full.fillna('lag', 0)
test_full['lag1'] = test_full['lag1'] + test_full['lag']
del test_full['lag']
del test_full['Demanda_uni_equil']

In [66]:
ids = test_full['id']
del test_full['id']
del test_full['Semana']
demand_log = model.predict(test_full)
sub = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})

In [69]:
import math
# Invert the log1p target: clip negative predictions to zero, then expm1 back to unit demand
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(lambda x: math.expm1(max(0, x)))

In [80]:
file_name = 'w9'+'_f'+str(model.num_features)+'_n'+str(model.max_iterations)+'_c'+str(model.column_subsample)
sub.save(path+file_name,format='csv')
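
The CSV is written with the SFrame's current column order (Demanda_uni_equil first, id second, as shown in the next cell). If the submission file is expected to list id first, the columns can be reordered before saving; a sketch:

In [ ]:
# Optional (sketch): put 'id' first if the submission format requires that column order.
# sub = sub[['id', 'Demanda_uni_equil']]
# sub.save(path + file_name, format='csv')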

In [70]:
sub


Out[70]:
+-------------------+---------+
| Demanda_uni_equil | id      |
+-------------------+---------+
| 3.2877291637      | 4721633 |
| 2.24223753928     | 6035419 |
| 1.68033878328     | 3462602 |
| 3.8207045231      | 855102  |
| 8.93359333692     | 3242700 |
| 11.8469821368     | 2421613 |
| 3.53498757097     | 1573296 |
| 8.40957090009     | 1619375 |
| 1.75464872086     | 3539201 |
| 11.0039233874     | 1631781 |
+-------------------+---------+
[6999251 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.