In [1]:
import pandas as pd
import numpy as np
import os
import math
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray

In [2]:
# 钢炮 ("steel cannon"): data directory on this machine
path = '/home/zongyi/bimbo_data/'

In [4]:
train = gl.SFrame.read_csv(path + 'train_lag5.csv', verbose=False)

In [8]:
town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)

In [12]:
train = train.join(town, on=['Agencia_ID','Producto_ID'], how='left')
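
A left join keeps every train row and leaves None wherever an
(Agencia_ID, Producto_ID) pair has no match in town; those Nones are what
the fillna calls further down replace. A toy illustration (the frames a
and b here are made up, not part of the data set):

a = gl.SFrame({'k': [1, 2], 'x': [10, 20]})
b = gl.SFrame({'k': [1], 'y': [100]})
print a.join(b, on='k', how='left')   # the row with k=2 comes back with y=None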

In [9]:
del train['id']
del train['Venta_uni_hoy']
del train['Venta_hoy']
del train['Dev_uni_proxima']
del train['Dev_proxima']
del train['Demanda_uni_equil']
del train['Town']
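
The dropped columns are the raw sales/returns fields and the target
itself, which are only known after the week has happened and would leak
the answer into the features, plus the row id and the free-text Town
name. The cell above is equivalent to:

for col in ['id', 'Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima',
            'Dev_proxima', 'Demanda_uni_equil', 'Town']:
    del train[col]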

In [ ]:
rl_train = gl.SFrame.read_csv(path + 're_lag_train.csv', verbose=False)
products = gl.SFrame.read_csv(path + 'products.csv', verbose=False)
clients = gl.SFrame.read_csv(path + 'clients.csv', verbose=False)

In [68]:
cluster = gl.SFrame.read_csv(path + 'prod_cluster.csv', verbose=False)

In [70]:
cluster = cluster[['Producto_ID','cluster']]
train = train.join(cluster, on=['Producto_ID'], how='left')

In [66]:
train = train.join(rl_train, on=['Cliente_ID','Producto_ID','Semana'], how='left')
train = train.join(products, on=['Producto_ID'], how='left')
train = train.join(clients, on=['Cliente_ID'], how='left')

In [71]:
train = train.fillna('re_lag1',0)
train = train.fillna('re_lag2',0)
train = train.fillna('re_lag3',0)
train = train.fillna('re_lag4',0)
train = train.fillna('re_lag5',0)
train = train.fillna('prom',0)
train = train.fillna('weight',0)
train = train.fillna('pieces',1)
train = train.fillna('w_per_piece',0)
train = train.fillna('healthy',0)
train = train.fillna('drink',0)
del train['brand']
del train['NombreProducto']
del rl_train
del products
del clients
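
Every join above is how='left', so keys without a match come back as
None; these fills replace them (1 for pieces, presumably to keep any
per-piece ratio well defined, and 0 everywhere else). A loop that does
the same thing:

fill_values = {'re_lag1': 0, 're_lag2': 0, 're_lag3': 0, 're_lag4': 0,
               're_lag5': 0, 'prom': 0, 'weight': 0, 'pieces': 1,
               'w_per_piece': 0, 'healthy': 0, 'drink': 0}
for col, val in fill_values.items():
    train = train.fillna(col, val)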

In [73]:
train


Out[73]:
Semana Agencia_ID Canal_ID Ruta_SAK Cliente_ID Producto_ID Demada_log lag1 lag2 lag3 lag4
9 1232 1 1007 2204561 1125 1.09861 0.0 0.0 0.0 0.0
9 1334 1 2004 4482236 31779 1.09861 0.0 0.0 0.0 0.0
9 2263 1 2805 736159 5621 0.693147 0.0 0.0 0.0 0.0
9 3221 1 1620 1748240 160 1.09861 0.0 0.0 0.0 0.0
9 2023 1 1208 2328381 1242 0.693147 0.0 0.0 0.0 0.0
9 1235 1 1110 267468 1220 1.09861 0.0 0.0 0.0 1.60944
9 4051 1 1215 121243 34255 1.09861 0.0 0.0 0.0 0.0
9 2214 1 1215 819679 1309 0.693147 0.0 0.0 0.0 0.0
9 1423 1 1205 8010586 34914 0.693147 0.0 0.0 0.693147 0.0
9 1351 1 1264 2359814 1232 1.09861 0.0 0.0 0.0 0.0

lag5 week_times lag_sum prior_sum n_a n_r n_c n_p t_c tcc tp_sum re_lag1
1.60944 1 1.60944 3.55535 68410.3 29312.3 34.4286 150267.0 2 4841 20252.3 0.0
0.693147 1 0.693147 1.38629 47270.7 20306.0 2.16667 5505.29 2 3308 380.017 0.0
0.0 1 0.0 0.0 31514.0 27657.0 20.0 9581.0 3 2892 2945.71 0.0
0.0 1 0.0 0.0 14062.0 9858.57 17.0 179.143 8 8144 94.1081 0.0
0.0 1 0.0 0.0 22247.7 52832.0 13.5714 291981.0 2 1557 9145.2 0.0
0.0 1 1.60944 1.60944 48509.6 29371.6 32.2857 127768.0 3 7463 19052.1 0.0
0.0 1 0.0 0.0 62044.6 46225.3 10.2857 12360.3 4 4532 12535.6 0.0
0.0 1 0.0 0.0 51983.3 46225.3 9.28571 180765.0 3 3780 10661.5 0.0
0.0 1 0.693147 0.693147 44791.3 58513.9 7.0 1953.14 6 3848 4631.22 0.0
1.09861 1 1.09861 2.19722 85080.3 22624.9 28.2857 210297.0 3 5972 13767.2 0.0

re_lag2 re_lag3 re_lag4 re_lag5 prom weight pieces w_per_piece healthy drink OXXO ARTELI ALSUPER
0.0 0.0 0 0 0 255.0 10.0 25.5 0 0 0 0 0
0.0 0.0 0 0 0 158.0 1.0 158.0 0 0 0 0 0
0.0 0.0 0 0 0 105.0 3.0 35.0 0 0 0 0 0
0.0 0.0 0 0 0 180.0 1.0 180.0 0 0 0 0 0
0.0 0.0 0 0 0 105.0 6.0 17.5 0 0 0 0 0
0.0 0.0 0 0 0 130.0 6.0 21.6666666667 0 0 0 0 0
0.0 0.0 0 0 0 360.0 30.0 12.0 0 0 0 0 0
0.0 0.0 0 0 0 66.0 4.0 16.5 0 0 0 0 0
0.0 0.0 0 0 0 280.0 1.0 280.0 0 0 0 0 0
0.0 0.0 0 0 0 255.0 1.0 255.0 0 0 0 0 0

BODEGA CALIMAX XICANS ABARROTES ...
0 0 0 0 ...
0 0 0 0 ...
0 0 0 0 ...
0 0 1 1 ...
0 0 0 0 ...
0 0 0 0 ...
0 0 0 0 ...
0 0 0 0 ...
0 0 0 0 ...
0 0 0 0 ...
[20982770 rows x 54 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [ ]:
# Make a train-test split
# train_data, test_data = train.random_split(0.9)

# Create a model.
model = gl.boosted_trees_regression.create(train, target='Demada_log',
                                           step_size=0.1,
                                           max_iterations=500,
                                           max_depth=10,
                                           metric='rmse',
                                           random_seed=321,
                                           column_subsample=0.7,
                                           row_subsample=0.85,
                                           validation_set=None,
                                           model_checkpoint_path=path,
                                           model_checkpoint_interval=100)

# Save predictions to an SArray
predictions = model.predict(train)

# Evaluate the model and save the results into a dictionary
results = model.evaluate(train)


External memory mode: 17 batches
Boosted trees regression:
--------------------------------------------------------
Number of examples          : 20982770
Number of features          : 53
Number of unpacked features : 53
Create disk column page 1/17
Create disk column page 2/17
Create disk column page 3/17
Create disk column page 4/17
Create disk column page 5/17
Create disk column page 6/17
Create disk column page 7/17
Create disk column page 8/17
Create disk column page 9/17
Create disk column page 10/17
Create disk column page 11/17
Create disk column page 12/17
Create disk column page 13/17
Create disk column page 14/17
Create disk column page 15/17
Create disk column page 16/17
Create disk column page 17/17
+-----------+--------------+---------------+
| Iteration | Elapsed Time | Training-rmse |
+-----------+--------------+---------------+
| 1         | 602.081921   | 1.266345      |
| 2         | 1175.433200  | 1.160358      |
| 3         | 1781.272537  | 1.067928      |
| 4         | 2351.885424  | 0.984901      |
| 5         | 2947.317572  | 0.912047      |
| 6         | 3531.875185  | 0.847991      |
| 7         | 4123.851269  | 0.792307      |
| 8         | 4701.408431  | 0.744160      |
| 9         | 5289.000911  | 0.702352      |
| 10        | 5880.336303  | 0.666009      |
| 11        | 6470.943582  | 0.634375      |
| 12        | 7070.005017  | 0.607974      |
| 13        | 7645.386498  | 0.585214      |
| 14        | 8231.949556  | 0.565952      |
| 15        | 8846.238127  | 0.550032      |
| 16        | 9447.344265  | 0.536394      |
| 17        | 10079.749252 | 0.524457      |
| 18        | 10686.831090 | 0.514907      |
| 19        | 11274.567132 | 0.506526      |
| 20        | 11884.941462 | 0.499422      |
| 21        | 12468.578234 | 0.493876      |
| 22        | 13087.783623 | 0.489125      |
| 23        | 13697.648926 | 0.484530      |
| 24        | 14310.710548 | 0.480636      |
| 25        | 14934.567692 | 0.477712      |
| 26        | 15553.848294 | 0.475085      |
| 27        | 16140.388482 | 0.473122      |
| 28        | 16749.566229 | 0.471158      |
| 29        | 17344.942468 | 0.469647      |
| 30        | 17956.321740 | 0.468224      |
| 31        | 18582.508245 | 0.467053      |
| 32        | 19192.379800 | 0.465907      |
| 33        | 19790.628060 | 0.464756      |
| 34        | 20387.362442 | 0.463801      |
| 35        | 20977.960389 | 0.463078      |
| 36        | 21599.855335 | 0.462385      |
| 37        | 22198.393052 | 0.461827      |
| 38        | 22825.912133 | 0.461234      |
| 39        | 23429.038983 | 0.460773      |
| 40        | 24065.745940 | 0.460155      |
| 41        | 24687.137883 | 0.459724      |
| 42        | 25284.781488 | 0.459441      |
| 43        | 25883.957927 | 0.458928      |
| 44        | 26478.585104 | 0.458636      |
| 45        | 27101.805919 | 0.458412      |
| 46        | 27695.050958 | 0.458183      |
| 47        | 28325.048194 | 0.457782      |
| 48        | 28942.742833 | 0.457491      |
| 49        | 29555.702793 | 0.457204      |
| 50        | 30176.346106 | 0.457082      |
| 51        | 30794.452068 | 0.456749      |
| 52        | 31389.604362 | 0.456648      |
| 53        | 31995.609339 | 0.456410      |
| 54        | 32618.672110 | 0.456174      |
| 55        | 33235.459798 | 0.455939      |
| 56        | 33847.627878 | 0.455816      |
| 57        | 34454.303135 | 0.455545      |
| 58        | 35049.221535 | 0.455413      |
| 59        | 35645.277093 | 0.455223      |
| 60        | 36226.157430 | 0.455108      |
| 61        | 36854.583829 | 0.454967      |
| 62        | 37450.506451 | 0.454801      |
| 63        | 38057.659125 | 0.454601      |
| 64        | 38681.779476 | 0.454491      |
| 65        | 39293.341074 | 0.454326      |
| 66        | 39926.123288 | 0.454171      |
| 67        | 40543.841929 | 0.453984      |
| 68        | 41159.754031 | 0.453736      |
| 69        | 41749.475985 | 0.453656      |
| 70        | 42356.859003 | 0.453395      |
| 71        | 42952.143154 | 0.453213      |
| 72        | 43541.779964 | 0.453096      |
| 73        | 44162.177546 | 0.452964      |
| 74        | 44790.121021 | 0.452780      |
| 75        | 45413.964257 | 0.452662      |
| 76        | 46019.665472 | 0.452550      |
| 77        | 46626.904009 | 0.452397      |
| 78        | 47234.102840 | 0.452306      |
| 79        | 47854.086393 | 0.452188      |
| 80        | 48470.251981 | 0.452068      |
| 81        | 49080.315144 | 0.451972      |
| 82        | 49680.958416 | 0.451778      |
| 83        | 50302.588890 | 0.451659      |
| 84        | 50897.670130 | 0.451603      |
| 85        | 51508.161311 | 0.451425      |
| 86        | 52119.838721 | 0.451267      |
| 87        | 52736.131833 | 0.451189      |
| 88        | 53347.135939 | 0.450994      |
| 89        | 53960.255755 | 0.450911      |
| 90        | 54575.373317 | 0.450816      |
| 91        | 55199.571454 | 0.450751      |
| 92        | 55809.768469 | 0.450693      |
| 93        | 56419.315046 | 0.450540      |
| 94        | 57016.334776 | 0.450403      |
| 95        | 57640.192859 | 0.450247      |
| 96        | 58256.519844 | 0.450044      |
| 97        | 58845.781191 | 0.450018      |
| 98        | 59477.200522 | 0.449888      |
| 99        | 60083.582079 | 0.449763      |
| 100       | 60697.985264 | 0.449645      |
Checkpointing to /home/zongyi/bimbo_data/model_checkpoint_100
| 101       | 61336.333509 | 0.449556      |
| 102       | 61932.404365 | 0.449496      |
| 103       | 62531.022287 | 0.449406      |
| 104       | 63133.045754 | 0.449317      |
| 105       | 63715.363490 | 0.449253      |
| 106       | 64311.828373 | 0.449163      |
| 107       | 64934.865371 | 0.449072      |
| 108       | 65526.410908 | 0.448999      |
| 109       | 66146.979760 | 0.448889      |
| 110       | 66741.066582 | 0.448837      |
| 111       | 67341.980913 | 0.448727      |
| 112       | 67961.489870 | 0.448656      |
| 113       | 68583.097097 | 0.448557      |
| 114       | 69185.195994 | 0.448487      |
| 115       | 69797.088495 | 0.448423      |
| 116       | 70398.374073 | 0.448325      |
| 117       | 71013.089513 | 0.448249      |
| 118       | 71623.400716 | 0.448157      |
| 119       | 72248.698328 | 0.448071      |
| 120       | 72841.982611 | 0.447975      |
| 121       | 73455.866168 | 0.447934      |
| 122       | 74079.774532 | 0.447888      |
| 123       | 74700.619369 | 0.447767      |
| 124       | 75316.520861 | 0.447672      |
| 125       | 75916.356529 | 0.447618      |
| 126       | 76528.960610 | 0.447501      |
| 127       | 77152.085864 | 0.447418      |
| 128       | 77766.129093 | 0.447360      |
| 129       | 78367.776875 | 0.447310      |
| 130       | 78969.092101 | 0.447255      |
| 131       | 79579.732656 | 0.447134      |
| 132       | 80198.076607 | 0.447053      |
| 133       | 80816.269200 | 0.446999      |
| 134       | 81432.412620 | 0.446903      |
| 135       | 82034.164122 | 0.446855      |
| 136       | 82640.177013 | 0.446785      |
| 137       | 83260.137051 | 0.446734      |
| 138       | 83876.302773 | 0.446681      |
| 139       | 84481.305418 | 0.446649      |
| 140       | 85075.515449 | 0.446588      |
| 141       | 85668.733325 | 0.446539      |
| 142       | 86264.585305 | 0.446464      |
| 143       | 86861.258422 | 0.446425      |
| 144       | 87449.254050 | 0.446379      |
| 145       | 88060.196180 | 0.446323      |
| 146       | 88669.049790 | 0.446213      |
| 147       | 89261.854507 | 0.446151      |
| 148       | 89851.103264 | 0.446107      |
| 149       | 90433.345252 | 0.446002      |
| 150       | 91028.427236 | 0.445944      |
| 151       | 91622.513501 | 0.445913      |
| 152       | 92226.076147 | 0.445879      |
| 153       | 92830.817749 | 0.445823      |
| 154       | 93415.195611 | 0.445767      |
| 155       | 94010.965041 | 0.445723      |
| 156       | 94615.647558 | 0.445671      |
| 157       | 95211.754439 | 0.445601      |
| 158       | 95808.752763 | 0.445560      |
| 159       | 96399.587330 | 0.445465      |
| 160       | 96988.698285 | 0.445393      |
| 161       | 97561.623558 | 0.445341      |
| 162       | 98134.223300 | 0.445284      |
| 163       | 98740.299136 | 0.445211      |
| 164       | 99347.885442 | 0.445163      |
| 165       | 99952.715350 | 0.445087      |
| 166       | 100541.698...| 0.445037      |
| 167       | 101125.365...| 0.445008      |
| 168       | 101741.459...| 0.444934      |
| 169       | 102329.635...| 0.444856      |
| 170       | 102947.183...| 0.444805      |
| 171       | 103529.690...| 0.444746      |
| 172       | 104126.991...| 0.444707      |
| 173       | 104723.519...| 0.444654      |
| 174       | 105334.493...| 0.444610      |
| 175       | 105939.617...| 0.444579      |
| 176       | 106527.687...| 0.444553      |
| 177       | 107102.709...| 0.444509      |
| 178       | 107714.846...| 0.444451      |
| 179       | 108301.198...| 0.444401      |
| 180       | 108904.852...| 0.444347      |
| 181       | 109519.345...| 0.444280      |
| 182       | 110122.747...| 0.444245      |
| 183       | 110706.763...| 0.444155      |
| 184       | 111295.153...| 0.444097      |
| 185       | 111915.642...| 0.444051      |
| 186       | 112525.897...| 0.443998      |
| 187       | 113134.002...| 0.443976      |
| 188       | 113737.639...| 0.443909      |
| 189       | 114331.684...| 0.443890      |
| 190       | 114924.617...| 0.443867      |
| 191       | 115539.858...| 0.443810      |
| 192       | 116123.718...| 0.443773      |
| 193       | 116730.599...| 0.443732      |
| 194       | 117332.990...| 0.443690      |
| 195       | 117939.798...| 0.443659      |
| 196       | 118535.466...| 0.443640      |
| 197       | 119147.642...| 0.443575      |
| 198       | 119743.016...| 0.443534      |
| 199       | 120360.785...| 0.443482      |
| 200       | 120956.557...| 0.443446      |
Checkpointing to /home/zongyi/bimbo_data/model_checkpoint_200
| 201       | 121564.926...| 0.443426      |
| 202       | 122179.637...| 0.443395      |
| 203       | 122812.592...| 0.443340      |
| 204       | 123404.992...| 0.443301      |
| 205       | 124030.600...| 0.443234      |
| 206       | 124664.355...| 0.443204      |
| 207       | 125251.483...| 0.443144      |
| 208       | 125839.588...| 0.443114      |
| 209       | 126468.753...| 0.443060      |
| 210       | 127080.524...| 0.443008      |
| 211       | 127703.462...| 0.442975      |
| 212       | 128289.428...| 0.442945      |
| 213       | 128906.176...| 0.442901      |
| 214       | 129490.354...| 0.442876      |
| 215       | 130114.744...| 0.442849      |
| 216       | 130720.179...| 0.442832      |
| 217       | 131321.186...| 0.442790      |
| 218       | 131921.919...| 0.442750      |
| 219       | 132522.902...| 0.442711      |
| 220       | 133143.226...| 0.442682      |
| 221       | 133743.015...| 0.442654      |
| 222       | 134326.159...| 0.442628      |
| 223       | 134949.344...| 0.442589      |
| 224       | 135557.868...| 0.442544      |
| 225       | 136172.146...| 0.442504      |
| 226       | 136784.813...| 0.442472      |
| 227       | 137405.815...| 0.442442      |
| 228       | 138024.578...| 0.442419      |
| 229       | 138638.996...| 0.442377      |
| 230       | 139233.856...| 0.442353      |
| 231       | 139830.229...| 0.442300      |
| 232       | 140441.602...| 0.442269      |
| 233       | 141046.911...| 0.442240      |
| 234       | 141665.039...| 0.442213      |
| 235       | 142260.481...| 0.442183      |
| 236       | 142874.277...| 0.442156      |
| 237       | 143473.995...| 0.442107      |
| 238       | 144087.229...| 0.442093      |
| 239       | 144688.342...| 0.442059      |
| 240       | 145293.918...| 0.442019      |
| 241       | 145923.452...| 0.441978      |
| 242       | 146541.931...| 0.441944      |
| 243       | 147142.128...| 0.441895      |
| 244       | 147782.670...| 0.441854      |
| 245       | 148402.958...| 0.441810      |
| 246       | 149013.096...| 0.441765      |
| 247       | 149624.820...| 0.441715      |
| 248       | 150266.944...| 0.441664      |
| 249       | 150863.596...| 0.441644      |
| 250       | 151479.802...| 0.441611      |
| 251       | 152059.709...| 0.441596      |
| 252       | 152675.315...| 0.441543      |
| 253       | 153277.927...| 0.441512      |
| 254       | 153879.988...| 0.441469      |
| 255       | 154483.783...| 0.441439      |
| 256       | 155107.930...| 0.441408      |
| 257       | 155686.834...| 0.441394      |
| 258       | 156308.949...| 0.441364      |
| 259       | 156930.462...| 0.441323      |
| 260       | 157529.918...| 0.441293      |
| 261       | 158145.546...| 0.441265      |
| 262       | 158754.823...| 0.441217      |

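The run above stops logging at iteration 262 of the requested 500, but
model_checkpoint_path has already saved full models at iterations 100 and
200 (see the "Checkpointing to ..." lines). A sketch for picking the run
back up from the last checkpoint, assuming a GraphLab Create version new
enough to expose resume_from_checkpoint:

# Hedged sketch: resume boosting from the saved iteration-200 model.
model = gl.boosted_trees_regression.create(train, target='Demada_log',
                                           step_size=0.1,
                                           max_iterations=500,
                                           max_depth=10,
                                           metric='rmse',
                                           random_seed=321,
                                           column_subsample=0.7,
                                           row_subsample=0.85,
                                           validation_set=None,
                                           resume_from_checkpoint=path + 'model_checkpoint_200')
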
In [31]:
print results


{'max_error': 6.058466911315918, 'rmse': 0.45040038110628317}

In [59]:
train_rmse = model.evaluate(train)
print train_rmse


{'max_error': 6.358242511749268, 'rmse': 0.4470555577201293}

In [36]:
model.summary()


Class                          : BoostedTreesRegression

Schema
------
Number of examples             : 17797989
Number of feature columns      : 21
Number of unpacked features    : 21

Settings
--------
Number of trees                : 200
Max tree depth                 : 10
Training time (sec)            : 11956.8374
Training rmse                  : 0.4465
Validation rmse                : 0.4507


In [38]:
test = gl.SFrame.read_csv(path + 'test_lag5.csv', verbose=False)
test = test.join(town, on=['Agencia_ID','Producto_ID'], how='left')
del test['Town']
test = test.fillna('t_c',1)
test = test.fillna('tcc',0)
test = test.fillna('tp_sum',0)
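
The test frame has to pass through the same join and the same fills as
the training rows, otherwise the model sees features on a different
scale. A hypothetical helper (prep_town is not in the original notebook)
that bundles the test-side steps so both splits can share them:

def prep_town(sf):
    # Attach town aggregates, drop the free-text name, fill unmatched keys.
    sf = sf.join(town, on=['Agencia_ID', 'Producto_ID'], how='left')
    del sf['Town']
    for col, val in [('t_c', 1), ('tcc', 0), ('tp_sum', 0)]:
        sf = sf.fillna(col, val)
    return sf

test = prep_town(gl.SFrame.read_csv(path + 'test_lag5.csv', verbose=False))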

In [41]:
ids = test['id']

In [44]:
del test['id']

In [45]:
demand_log = model.predict(test)

In [49]:
sub = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})

In [50]:
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(lambda x: math.expm1(max(0, x)))
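
Demada_log is evidently log1p of the unit demand (the table above shows
0.693147 = log1p(1) and 1.09861 = log1p(2)), so predictions come back to
units via expm1, clamped at zero first since demand cannot be negative.
This is also why RMSE on the log column is effectively the competition's
RMSLE on raw demand. A quick round-trip check:

# Verify the inverse transform against values seen in the table head.
assert abs(math.expm1(0.693147) - 1.0) < 1e-5   # log1p(1) -> demand 1
assert abs(math.expm1(1.09861) - 2.0) < 1e-4    # log1p(2) -> demand 2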

In [54]:
sub = sub.sort('id')

In [55]:
sub


Out[55]:
Demanda_uni_equil id
3.0620377682 0
1.21354618384 1
1.94959367403 2
1.23797419459 3
0.882530603051 4
2.78527065495 5
2.00794776168 6
4.68473360299 7
5.94600461232 8
6.09402805545 9
[6999251 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [56]:
sub.save(path + 'gbrt_sub2.csv', format='csv')
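
SFrame.save with format='csv' writes a header row, which Kaggle's
submission format (id, Demanda_uni_equil) needs. The frame above holds
the columns in the reverse order; if the grader turned out to be
order-sensitive (an assumption; Kaggle normally matches on header names),
reorder before saving:

sub = sub[['id', 'Demanda_uni_equil']]
sub.save(path + 'gbrt_sub2.csv', format='csv')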
