In [2]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray

In [3]:
'''Steel cannon (machine nickname)'''
path = '/home/zongyi/bimbo_data/'

In [4]:
train = gl.SFrame.read_csv(path + 'train_lag5_w9.csv', verbose=False)


This non-commercial license of GraphLab Create for academic use is assigned to zong-yi.liu@irit.fr and will expire on July 13, 2017.
[INFO] graphlab.cython.cy_server: GraphLab Create v2.0.1 started. Logging: /tmp/graphlab_server_1472143272.log

In [4]:
town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
train = train.join(town, on=['Agencia_ID','Producto_ID'], how='left')
train = train.fillna('t_c',1)
train = train.fillna('tcc',0)
train = train.fillna('tp_sum',0)
del train['Town']
del train['t_c']

In [5]:
del train['n_t']

In [6]:
del train['id']
del train['Venta_uni_hoy']
del train['Venta_hoy']
del train['Dev_uni_proxima']
del train['Dev_proxima']
del train['Demanda_uni_equil']
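
The 'Demada_log' target column already comes log-transformed in train_lag5_w9.csv; it is not created in this notebook. A sketch of the assumed upstream transform (consistent with the expm1 inversion applied to the predictions at the end), using the original Kaggle column as a hypothetical input:

In [ ]:
# Sketch (not executed here): the target is assumed to be log1p of the adjusted demand,
# since the final predictions are inverted with expm1.
# raw = gl.SFrame.read_csv(path + 'train.csv', verbose=False)
# raw['Demada_log'] = raw['Demanda_uni_equil'].apply(lambda x: math.log1p(x))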

In [6]:
# relag_train = gl.SFrame.read_csv(path + 're_lag_train.csv', verbose=False)
# train = train.join(relag_train, on=['Cliente_ID','Producto_ID','Semana'], how='left')
# train = train.fillna('re_lag1',0)
# train = train.fillna('re_lag2',0)
# train = train.fillna('re_lag3',0)
# train = train.fillna('re_lag4',0)
# train = train.fillna('re_lag5',0)
# del relag_train

In [7]:
# pd = gl.SFrame.read_csv(path + 'products.csv', verbose=False)
# train = train.join(pd, on=['Producto_ID'], how='left')
# train = train.fillna('prom',0)
# train = train.fillna('weight',0)
# train = train.fillna('pieces',1)
# train = train.fillna('w_per_piece',0)
# train = train.fillna('healthy',0)
# train = train.fillna('drink',0)
# del train['brand']
# del train['NombreProducto']
# del pd

In [8]:
# client = gl.SFrame.read_csv(path + 'clients.csv', verbose=False)
# train = train.join(client, on=['Cliente_ID'], how='left')
# del client

In [9]:
# cluster = gl.SFrame.read_csv(path + 'prod_cluster.csv', verbose=False)
# cluster = cluster[['Producto_ID','cluster']]
# train = train.join(cluster, on=['Producto_ID'], how='left')

In [36]:
# del train['week_times']
# del train['Semana']
# del train['Canal_ID']
train = train.fillna('lag1',0)
train = train.fillna('lag2',0)
train = train.fillna('lag3',0)
train = train.fillna('lag4',0)
train = train.fillna('lag5',0)
train = train.fillna('lag_sum',0)
train = train.fillna('prior_sum',0)
train = train.fillna('n_a',0)
train = train.fillna('n_r',0)
train = train.fillna('n_c',0)
train = train.fillna('n_p',0)
train


Out[36]:
+------------+----------+------------+-------------+------------+---------+----------+---------+---------+---------+
| Agencia_ID | Ruta_SAK | Cliente_ID | Producto_ID | Demada_log | lag1    | lag2     | lag3    | lag4    | lag5    |
+------------+----------+------------+-------------+------------+---------+----------+---------+---------+---------+
| 2234       | 1234     | 2308240    | 1240        | 2.19722    | 1.79176 | 0.693147 | 1.94591 | 0.0     | 0.0     |
| 1121       | 1415     | 1758210    | 35651       | 2.3979     | 1.94591 | 2.56495  | 3.4012  | 2.48491 | 2.56495 |
| 2032       | 1267     | 3554707    | 1240        | 1.38629    | 1.60944 | 1.60944  | 1.38629 | 1.79176 | 1.94591 |
| 2220       | 1227     | 2492643    | 1109        | 1.09861    | 1.09861 | 0.693147 | 1.09861 | 1.09861 | 0.0     |
| 1612       | 1139     | 1965168    | 972         | 1.60944    | 1.09861 | 0.693147 | 1.38629 | 1.09861 | 1.79176 |
| 2230       | 1002     | 2482931    | 3631        | 2.48491    | 2.3979  | 2.56495  | 2.56495 | 2.30259 | 1.09861 |
| 1121       | 1049     | 2449109    | 1125        | 2.19722    | 3.29584 | 3.04452  | 1.94591 | 3.17805 | 2.77259 |
| 1121       | 1418     | 169387     | 1240        | 1.38629    | 1.60944 | 2.3979   | 1.38629 | 0.0     | 0.0     |
| 4041       | 1256     | 205279     | 1240        | 1.09861    | 1.09861 | 1.09861  | 1.09861 | 1.38629 | 1.09861 |
| 3215       | 4435     | 1280994    | 43285       | 1.79176    | 2.3979  | 1.79176  | 2.70805 | 2.3979  | 2.83321 |
+------------+----------+------------+-------------+------------+---------+----------+---------+---------+---------+
+---------+-----------+---------+---------+---------+----------+------+---------+
| lag_sum | prior_sum | n_a     | n_r     | n_c     | n_p      | tcc  | tp_sum  |
+---------+-----------+---------+---------+---------+----------+------+---------+
| 4.43082 | 4.43082   | 11387.0 | 11333.0 | 9.33333 | 142714.0 | 1474 | 7032.04 |
| 12.9619 | 12.9619   | 42768.0 | 5594.0  | 16.6667 | 90854.7  | 5404 | 27611.6 |
| 8.34284 | 10.1346   | 32083.0 | 10454.0 | 7.33333 | 142714.0 | 4551 | 28371.8 |
| 3.98898 | 3.98898   | 13659.3 | 16880.3 | 5.66667 | 111838.0 | 1894 | 7265.43 |
| 6.06842 | 7.16704   | 16434.0 | 6750.67 | 14.6667 | 43849.3  | 2315 | 3132.52 |
| 10.929  | 13.3269   | 24739.7 | 21504.0 | 32.3333 | 66206.0  | 3509 | 1317.56 |
| 14.2369 | 17.495    | 42768.0 | 2840.67 | 20.3333 | 105378.0 | 5404 | 19233.5 |
| 5.39363 | 5.39363   | 42768.0 | 6322.33 | 14.6667 | 142714.0 | 5404 | 23690.2 |
| 5.78073 | 7.16704   | 22477.7 | 16456.7 | 8.66667 | 142714.0 | 3356 | 19100.2 |
| 12.1288 | 12.1288   | 16032.3 | 1144.33 | 25.6667 | 98334.7  | 7784 | 36823.3 |
+---------+-----------+---------+---------+---------+----------+------+---------+
[10408713 rows x 18 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
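
The chain of fillna calls above could be written more compactly as a loop over the lag and count columns; a sketch with identical behaviour:

In [ ]:
# Equivalent, more compact form of the fillna chain above (sketch):
# for col in ['lag1','lag2','lag3','lag4','lag5','lag_sum','prior_sum','n_a','n_r','n_c','n_p']:
#     train = train.fillna(col, 0)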

In [5]:
# Make a 99/1 train-validation split; test_data is used only as the validation set below.
# Note that the full `train` SFrame (not train_data) is passed to create(), so the
# validation rows also appear in the training data.
train_data, test_data = train.random_split(0.99, seed=788)

# Create a model.
model = gl.boosted_trees_regression.create(train, target='Demada_log',
                                           step_size=0.1,
                                           max_iterations=500,
                                           max_depth = 10,
                                          metric='rmse',
                                          random_seed=78,
                                          column_subsample=0.6,
                                          row_subsample=0.85,
                                          validation_set=test_data,
                                          model_checkpoint_path=path,
                                          model_checkpoint_interval=500,
                                          resume_from_checkpoint=path+'model_checkpoint_1000_w9')

# Earlier training log line (iteration | elapsed time (s) | training rmse | validation rmse):
#500       | 14060.836344 | 0.435407      | 0.447519
# Since this run resumes from a 1000-iteration checkpoint, the max_iterations=500 given above
# is ignored and no further boosting iterations are performed (see the warnings below).


WARNING: Detected extremely low variance for feature(s) 'Semana' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Resuming from checkpoint at /home/zongyi/bimbo_data/model_checkpoint_1000_w9
Boosted trees regression:
--------------------------------------------------------
Number of examples          : 10408713
Number of features          : 24
Number of unpacked features : 24
Warning: ignoring provided value of column_subsample which is different from the model checkpoint
Warning: ignoring provided value of max_iterations which is different from the model checkpoint
Warning: ignoring provided value of model_checkpoint_interval which is different from the model checkpoint
Warning: ignoring provided value of random_seed which is different from the model checkpoint
Resumed training from checkpoint at iteration 1000 which is greater than or equal to max_iterations 1000

In [7]:
model.features


Out[7]:
['Agencia_ID',
 'Canal_ID',
 'Ruta_SAK',
 'Cliente_ID',
 'Producto_ID',
 'lag1',
 'lag2',
 'lag3',
 'lag4',
 'lag5',
 'n_a',
 'n_r',
 'n_c',
 'n_p']

In [43]:
w = model.get_feature_importance()
w = w.add_row_number()
w


Out[43]:
+----+-------------+-------+-------+
| id | name        | index | count |
+----+-------------+-------+-------+
| 0  | Cliente_ID  | None  | 37954 |
| 1  | Ruta_SAK    | None  | 37902 |
| 2  | n_c         | None  | 27015 |
| 3  | n_r         | None  | 26638 |
| 4  | Producto_ID | None  | 25591 |
| 5  | tp_sum      | None  | 24838 |
| 6  | Agencia_ID  | None  | 24514 |
| 7  | n_p         | None  | 24358 |
| 8  | prior_sum   | None  | 22736 |
| 9  | lag_sum     | None  | 22532 |
+----+-------------+-------+-------+
[17 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
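
To see the full importance table rather than just the head (as the note above suggests):

In [ ]:
# w.print_rows(num_rows=17)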

In [44]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'}) 
%matplotlib inline

figsize(16, 9)
plt.bar(w['id'], w['count'], tick_label=w['name'])

plt.xticks(rotation=45)


Out[44]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]),
 <a list of 17 Text xticklabel objects>)

In [20]:
# Save predictions to an SArray
predictions = model.predict(train)

# Evaluate the model on the training data (in-sample fit) and save the results into a dictionary
results = model.evaluate(train)
print results


{'max_error': 6.300516724586487, 'rmse': 0.4389403189567331}
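
Because the target is log1p-transformed demand, RMSE on 'Demada_log' is the competition's RMSLE on the original unit scale. A sketch of computing it by hand on a held-out SFrame `val` (hypothetical name) with the same columns:

In [ ]:
# Sketch: RMSE in log1p space == RMSLE in unit space.
# preds = model.predict(val)
# rmsle = math.sqrt(((preds - val['Demada_log']) ** 2).mean())
# print rmsle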


In [88]:
'''Add feature to week 11'''
def feature_w11(test, lag_sum=0, prior_sum=0):
    # Predict week 10 first, then feed those predictions back in as the lag-1 feature
    # for the week-11 rows (week 11 has no observed lag from week 10).
    # Note: the del statements below also drop 'id' and 'Semana' from the caller's `test`.
    test_full = test.copy()
    ids = test['id']
    del test['id']
    del test['Semana']
    demand_log = model.predict(test)
    sub1 = gl.SFrame({'id': ids, 'Demanda_uni_equil': demand_log})
    test_full = test_full.join(sub1, on=['id'], how='left')
    # Mean predicted log-demand per (Semana, Cliente_ID, Producto_ID), shifted forward
    # one week so that the week-10 means join onto the week-11 rows as their lag-1 value.
    lag11 = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag11': agg.MEAN('Demanda_uni_equil')})
    lag11['Semana'] = lag11['Semana'].apply(lambda x: x + 1)
    test_full = test_full.join(lag11, on=['Semana','Cliente_ID','Producto_ID'], how='left')
    test_full = test_full.fillna('lag11', 0)
    # Week-10 rows get lag11 == 0 after fillna, so only week-11 rows are affected.
    test_full['lag1'] = test_full['lag1'] + test_full['lag11']

    if lag_sum == 1:
        # Also fold the predicted week-10 demand into the rolling lag_sum feature.
        test_full['lag_sum'] = test_full['lag_sum'] + test_full['lag11']

    if prior_sum == 1:
        # Same idea for prior_sum, but using the summed (not mean) predicted demand.
        lag_sum11 = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag_sum11': agg.SUM('Demanda_uni_equil')})
        lag_sum11['Semana'] = lag_sum11['Semana'].apply(lambda x: x + 1)
        test_full = test_full.join(lag_sum11, on=['Semana','Cliente_ID','Producto_ID'], how='left')
        test_full = test_full.fillna('lag_sum11', 0)
        test_full['prior_sum'] = test_full['prior_sum'] + test_full['lag_sum11']
        del test_full['lag_sum11']

    del test_full['lag11']
    del test_full['Demanda_uni_equil']
    del test_full['Semana']
    return test_full

In [87]:
test = gl.SFrame.read_csv(path + 'test_lag5_w9.csv', verbose=False)
test = test.join(town, on=['Agencia_ID','Producto_ID'], how='left')
test = test.fillna('t_c',1)
test = test.fillna('tcc',0)
test = test.fillna('tp_sum',0)
del test['Town']
del test['t_c']
del test['n_t']

In [89]:
# del test['prior_sum']
# del test['lag_sum']
del test['week_times']
# del test['Semana']
del test['Canal_ID']
test = test.fillna('lag1',0)
test = test.fillna('lag2',0)
test = test.fillna('lag3',0)
test = test.fillna('lag4',0)
test = test.fillna('lag5',0)
test = test.fillna('lag_sum',0)
test = test.fillna('prior_sum',0)
test = test.fillna('n_a',0)
test = test.fillna('n_r',0)
test = test.fillna('n_c',0)
test = test.fillna('n_p',0)
print test.head()


+---------+--------+------------+----------+------------+-------------+---------+
|    id   | Semana | Agencia_ID | Ruta_SAK | Cliente_ID | Producto_ID |   lag1  |
+---------+--------+------------+----------+------------+-------------+---------+
| 4721633 |   10   |    4037    |   1153   |  2398808   |    43202    |   0.0   |
| 6035419 |   10   |    2239    |   1157   |  1529806   |     1230    | 1.09861 |
| 3462602 |   10   |    1366    |   1051   |  1368806   |     1146    | 1.09861 |
|  855102 |   10   |    1911    |   1156   |   867160   |     1232    | 1.38629 |
| 3242700 |   11   |    1952    |   5721   |  1126647   |    47612    |   0.0   |
| 2421613 |   10   |    2653    |   2117   |   954439   |    35305    | 2.77259 |
| 1573296 |   10   |    1427    |   1102   |  4561415   |    43197    | 1.38629 |
| 1619375 |   10   |    1629    |   1013   |   169508   |     1146    | 2.07944 |
| 3539201 |   11   |    1120    |   1453   |  4602755   |     1230    |   0.0   |
| 1631781 |   10   |    1385    |   6617   |   20130    |     2025    | 2.56495 |
+---------+--------+------------+----------+------------+-------------+---------+
+----------+----------+----------+----------+---------+-----------+---------+
|   lag2   |   lag3   |   lag4   |   lag5   | lag_sum | prior_sum |   n_a   |
+----------+----------+----------+----------+---------+-----------+---------+
|   0.0    | 1.60944  | 1.94591  | 1.38629  | 4.94164 |  4.94164  | 34667.0 |
| 1.38629  |   0.0    |   0.0    |   0.0    | 2.48491 |  3.17805  | 15941.0 |
| 0.693147 | 0.693147 | 1.38629  | 0.693147 | 4.56435 |   5.2575  | 10191.7 |
|   0.0    |   0.0    | 0.693147 | 1.38629  | 3.46574 |  6.68461  | 63002.3 |
| 3.09104  | 2.99573  | 2.07944  |   0.0    | 8.16622 |  8.16622  | 12083.3 |
| 3.04452  | 2.56495  |   0.0    |   0.0    | 8.38206 |  8.38206  | 35352.0 |
| 1.79176  | 1.94591  | 1.94591  | 1.94591  | 9.01578 |  10.9617  | 15958.0 |
| 2.19722  | 1.79176  | 2.07944  | 2.19722  | 10.3451 |  14.4394  | 39959.7 |
| 1.38629  | 1.38629  | 1.09861  | 1.38629  |  5.2575 |  6.35611  | 36688.3 |
|   0.0    | 2.56495  | 2.48491  | 2.07944  | 9.69425 |  14.7441  | 3128.33 |
+----------+----------+----------+----------+---------+-----------+---------+
+---------+---------+----------+------+---------+
|   n_r   |   n_c   |   n_p    | tcc  |  tp_sum |
+---------+---------+----------+------+---------+
| 14727.7 | 18.6667 | 26207.7  | 5437 | 22072.1 |
| 12876.3 |   19.0  | 48687.7  | 2239 | 7288.35 |
| 12811.0 | 7.66667 | 117553.0 | 1275 | 6922.05 |
| 14546.7 |   28.0  | 107057.0 | 8302 | 18695.9 |
| 250.667 |   3.0   |  922.0   | 1598 | 1831.61 |
| 6104.33 | 8.66667 | 119140.0 | 4757 | 9574.76 |
| 26384.0 |   13.0  | 23312.3  | 2526 | 10830.0 |
| 15596.7 | 31.6667 | 117553.0 | 5960 | 15834.2 |
|  2989.0 |   27.0  | 48687.7  | 5094 | 7680.14 |
|  2996.0 | 6.66667 | 6303.67  | 4087 | 2287.19 |
+---------+---------+----------+------+---------+
[10 rows x 19 columns]
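
feature_w11 only makes sense if the test set covers weeks 10 and 11, since the week-10 predictions are recycled as the week-11 lag; a quick sanity check, sketched:

In [ ]:
# Sanity check (sketch): the recursive featurization assumes only weeks 10 and 11 are present.
# print test['Semana'].unique()   # expected: [10, 11]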


In [ ]:
test_full = feature_w11(test, lag_sum=1, prior_sum=1)

In [64]:
# Inline version of feature_w11 (without the lag_sum / prior_sum updates):
# predict week 10, then use the mean predicted log-demand per (Cliente_ID, Producto_ID)
# as the week-11 lag-1 feature.
test_full = test.copy()
ids = test['id']
del test['id']
demand_log = model.predict(test)
sub1 = gl.SFrame({'id': ids, 'Demanda_uni_equil': demand_log})
test_full = test_full.join(sub1, on=['id'], how='left')
lag = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag': agg.MEAN('Demanda_uni_equil')})
lag['Semana'] = lag['Semana'].apply(lambda x: x + 1)
test_full = test_full.join(lag, on=['Semana','Cliente_ID','Producto_ID'], how='left')
test_full = test_full.fillna('lag', 0)
test_full['lag1'] = test_full['lag1'] + test_full['lag']
del test_full['lag']
del test_full['Demanda_uni_equil']

In [66]:
ids = test_full['id']
del test_full['id']
del test_full['Semana']
demand_log = model.predict(test_full)
sub = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})

In [69]:
import math
# Invert the log1p target: clip negative predictions to zero, then expm1 back to unit demand
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(lambda x: math.expm1(max(0, x)))

In [80]:
file_name = 'w9'+'_f'+str(model.num_features)+'_n'+str(model.max_iterations)+'_c'+str(model.column_subsample)
sub.save(path+file_name,format='csv')
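
The CSV is written with the SFrame's current column order (Demanda_uni_equil first, id second, as shown in the next cell). If the submission file is expected to list id first, the columns can be reordered before saving; a sketch:

In [ ]:
# Optional (sketch): put 'id' first if the submission format requires that column order.
# sub = sub[['id', 'Demanda_uni_equil']]
# sub.save(path + file_name, format='csv')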

In [70]:
sub


Out[70]:
+-------------------+---------+
| Demanda_uni_equil | id      |
+-------------------+---------+
| 3.2877291637      | 4721633 |
| 2.24223753928     | 6035419 |
| 1.68033878328     | 3462602 |
| 3.8207045231      | 855102  |
| 8.93359333692     | 3242700 |
| 11.8469821368     | 2421613 |
| 3.53498757097     | 1573296 |
| 8.40957090009     | 1619375 |
| 1.75464872086     | 3539201 |
| 11.0039233874     | 1631781 |
+-------------------+---------+
[6999251 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.