In [1]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray
In [2]:
'''钢炮'''
# The bare string above is a label left by the author (Chinese, literally
# "steel cannon"); presumably a nickname for the machine this path lives on
# -- TODO confirm.  It has no runtime effect.
#
# Root directory for the CSV inputs, the model checkpoints and the submission
# file.  The BIMBO_DATA_DIR environment variable overrides the original
# hard-coded location so the notebook can run on another machine unchanged.
# os.path.join(..., '') guarantees a trailing separator, which the rest of the
# notebook relies on when it builds file names as `path + '<name>.csv'`.
path = os.path.join(os.environ.get('BIMBO_DATA_DIR', '/home/zongyi/bimbo_data/'), '')
In [4]:
# Read train_lag5.csv from the data directory into an SFrame bound to `train`;
# verbose=False is passed to read_csv.
train = gl.SFrame.read_csv(path + 'train_lag5.csv', verbose=False)
In [8]:
# Read towns.csv from the data directory into an SFrame bound to `town`.
town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
In [12]:
# Left-join the `town` table onto `train`, matching on both Agencia_ID and
# Producto_ID; how='left' keeps training rows that have no match in `town`.
train = train.join(town, on=['Agencia_ID','Producto_ID'], how='left')
In [9]:
# Remove, in one statement, the columns that should not stay in `train`.
# Presumably these are the row id, the raw outcome columns and the town label,
# dropped so they are not used as model inputs -- TODO confirm.
del (
    train['id'],
    train['Venta_uni_hoy'],
    train['Venta_hoy'],
    train['Dev_uni_proxima'],
    train['Dev_proxima'],
    train['Demanda_uni_equil'],
    train['Town'],
)
In [ ]:
# Load the three auxiliary tables, in this order: re_lag_train.csv,
# products.csv and clients.csv.
# NOTE: `pd` here rebinds the name that the import cell uses for pandas, so
# pandas is no longer reachable as `pd` once this cell has run.
rl_train, pd, clt = (
    gl.SFrame.read_csv(path + csv_name, verbose=False)
    for csv_name in ('re_lag_train.csv', 'products.csv', 'clients.csv')
)
In [68]:
# Read prod_cluster.csv from the data directory into an SFrame bound to `cluster`.
cluster = gl.SFrame.read_csv(path + 'prod_cluster.csv', verbose=False)
In [70]:
# Narrow `cluster` to the join key plus the 'cluster' column, so the left
# join below attaches only that one column to `train`.
cluster = cluster[['Producto_ID', 'cluster']]
train = train.join(
    cluster,
    how='left',
    on=['Producto_ID'],
)
In [66]:
# Attach the auxiliary tables to `train` with three successive left joins:
# `rl_train` on (Cliente_ID, Producto_ID, Semana), then the product table
# (bound to `pd`, not pandas) on Producto_ID, then `clt` on Cliente_ID.
train = (
    train
    .join(rl_train, on=['Cliente_ID', 'Producto_ID', 'Semana'], how='left')
    .join(pd, on=['Producto_ID'], how='left')
    .join(clt, on=['Cliente_ID'], how='left')
)
In [71]:
# Fill missing values left by the left joins.  Every listed column is filled
# with 0 except 'pieces', which is filled with 1.
train = (
    train
    .fillna('re_lag1', 0)
    .fillna('re_lag2', 0)
    .fillna('re_lag3', 0)
    .fillna('re_lag4', 0)
    .fillna('re_lag5', 0)
    .fillna('prom', 0)
    .fillna('weight', 0)
    .fillna('pieces', 1)
    .fillna('w_per_piece', 0)
    .fillna('healthy', 0)
    .fillna('drink', 0)
)

# Drop two columns from the joined frame.
del train['brand'], train['NombreProducto']

# Unbind the auxiliary tables now that they are merged into `train`.
# `pd` is the product table here; deleting it does not bring pandas back.
del rl_train, pd, clt
In [73]:
# Display the assembled training frame as the cell output.
train
Out[73]:
In [ ]:
# Fit a boosted-trees regressor on the whole `train` frame.  No hold-out split
# is made (a random_split(0.9) was tried and left disabled) and
# validation_set=None is passed, so the figures computed below are
# training-set metrics.
# NOTE(review): the target column is spelled 'Demada_log' -- confirm that it
# matches the column name in the training CSV.
model = gl.boosted_trees_regression.create(
    train,
    target='Demada_log',
    # Tree / boosting shape.
    max_iterations=500,
    max_depth=10,
    step_size=0.1,
    # Sub-sampling, seeded with random_seed.
    row_subsample=0.85,
    column_subsample=0.7,
    random_seed=321,
    # Evaluation settings.
    metric='rmse',
    validation_set=None,
    # Checkpoints are written under the data directory with an interval of 100
    # (iterations, going by the parameter name -- confirm).
    model_checkpoint_path=path,
    model_checkpoint_interval=100,
)

# Predictions for the training rows themselves (an SArray).
predictions = model.predict(train)

# Evaluation results on the training rows, returned as a dictionary.
results = model.evaluate(train)
In [31]:
# Print the evaluation results computed on the training frame.
print(results)
In [59]:
# Evaluate the model on the training frame again and print whatever
# evaluate() returns (the whole result object, not only an RMSE value).
train_rmse = model.evaluate(train)
print(train_rmse)
In [36]:
# Show the fitted model's summary.
model.summary()
In [ ]:
In [38]:
# Build the test frame: read test_lag5.csv and left-join the town table on
# the same keys used for `train`.
test = (
    gl.SFrame.read_csv(path + 'test_lag5.csv', verbose=False)
    .join(town, on=['Agencia_ID', 'Producto_ID'], how='left')
)

# Drop the 'Town' column brought in by the join.
del test['Town']

# Fill missing values: 't_c' with 1, 'tcc' and 'tp_sum' with 0.
test = (
    test
    .fillna('t_c', 1)
    .fillna('tcc', 0)
    .fillna('tp_sum', 0)
)
In [41]:
# Keep the test-row ids in their own variable; they are used later as the
# 'id' column of the submission frame.
ids = test['id']
In [44]:
# Remove 'id' from the test frame, mirroring its removal from `train`.
del test['id']
In [45]:
# Predict on the test frame.  Going by the variable name the output is on a
# log scale -- TODO confirm against how the target column was built.
demand_log = model.predict(test)
In [49]:
# Assemble the submission frame: the saved ids next to the raw predictions,
# stored under the column name 'Demanda_uni_equil'.
sub = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})
In [50]:
# Map each prediction back to a non-negative demand: clamp at 0, then apply
# expm1 (exp(x) - 1).  Presumably this inverts a log1p-transformed target
# -- TODO confirm.
# The clamp uses the builtin max() because the math module has no `max`, and
# expm1 is reached through `math` because the notebook's import cell never
# imports a bare `expm1`.
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(
    lambda pred: math.expm1(max(0, pred))
)
In [54]:
# Order the submission rows by 'id'.
sub = sub.sort('id')
In [55]:
# Display the submission frame as the cell output.
sub
Out[55]:
In [56]:
# Write the submission frame as CSV to gbrt_sub2.csv in the data directory.
sub.save(path+'gbrt_sub2.csv',format='csv')
In [ ]: