In [2]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray
from graphlab import mxnet as mx
In [3]:
'''钢炮'''
# Base directory for the Kaggle Grupo Bimbo competition data.
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
path = '/home/zongyi/bimbo_data/'
In [13]:
train = gl.SFrame.read_csv(path + 'train_lag5_w9_mean.csv', verbose=False)
In [ ]:
In [ ]:
# Define the network symbol, equivalent to linear regression:
# a single fully-connected layer with one output unit and squared loss.
net = mx.symbol.Variable('data')
net = mx.symbol.FullyConnected(data=net, name='fc1', num_hidden=1)
net = mx.symbol.LinearRegressionOutput(data=net, name='lr')
# Load data into SFrame and normalize features
# sf = gl.SFrame.read_csv('https://static.turi.com/datasets/regression/houses.csv')
# features = ['tax', 'bedroom', 'bath', 'size', 'lot']
features = ['Agencia_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID', 'lag1','lag2','lag3','lag4','lag5','n_a','n_r','n_c','n_p','mpca']
# for f in features:
# sf[f] = sf[f] - sf[f].mean()
# sf[f] = sf[f] / sf[f].std()
# Prepare the input iterator from SFrame
# `data_name` must match the first layer's name of the network.
# `label_name` must match the last layer's name plus "_label".
# NOTE(review): the label column is spelled 'Demada_log' (not 'Demanda_log');
# the same spelling is used for the GBT target below, so presumably it matches
# the CSV header — confirm against the training file.
dataiter = mx.io.SFrameIter(train, data_field=features, label_field='Demada_log',
data_name='data', label_name='lr_label',
batch_size=1)
# Train the network.
# NOTE(review): batch_size=1 over ~70M rows is extremely slow — confirm intended.
model = mx.model.FeedForward.create(symbol=net, X=dataiter, num_epoch=20,
learning_rate=1e-2,
eval_metric='rmse')
# Make prediction
model.predict(dataiter)
In [4]:
# town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
# train = train.join(town, on=['Agencia_ID','Producto_ID'], how='left')
# train = train.fillna('t_c',1)
# train = train.fillna('tcc',0)
# train = train.fillna('tp_sum',0)
# del train['Town']
# del train['t_c']
In [14]:
# Drop the row identifier and the raw target / target-leakage columns
# that must not be used as training features.
for dropped_col in ['id', 'Venta_uni_hoy', 'Venta_hoy',
                    'Dev_uni_proxima', 'Dev_proxima', 'Demanda_uni_equil']:
    del train[dropped_col]
In [15]:
# Remove the remaining non-feature columns for this experiment
# (prior_sum / lag_sum / week_times were disabled for the week-9 run).
for dropped_col in ['prior_sum', 'lag_sum', 'week_times', 'Semana', 'Canal_ID']:
    del train[dropped_col]
# Missing lag and count features mean "no history" — treat as zero.
for filled_col in ['lag1', 'lag2', 'lag3', 'lag4', 'lag5',
                   'n_a', 'n_r', 'n_c', 'n_p']:
    train = train.fillna(filled_col, 0)
train
Out[15]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [5]:
# Make a train-test split: hold out 1% of the rows for validation.
train_data, test_data = train.random_split(0.99, seed=788)
# Create a model.
# BUG FIX: the original trained on the full `train` frame, so `test_data`
# (a random subset of `train`) leaked into training and the reported
# validation RMSE was optimistically biased.  Train on `train_data` only.
# NOTE(review): target is spelled 'Demada_log' (not 'Demanda_log') —
# presumably it matches the CSV column name; verify against the file header.
model = gl.boosted_trees_regression.create(train_data, target='Demada_log',
                                           step_size=0.1,
                                           max_iterations=500,
                                           max_depth=10,
                                           metric='rmse',
                                           random_seed=78,
                                           column_subsample=0.6,
                                           row_subsample=0.85,
                                           validation_set=test_data,
                                           model_checkpoint_path=path,
                                           model_checkpoint_interval=500,
                                           resume_from_checkpoint=path+'model_checkpoint_1000_w9')
#500 | 14060.836344 | 0.435407 | 0.447519
In [43]:
# Feature-importance table, with a row index added for plotting below.
w = model.get_feature_importance().add_row_number()
w
Out[43]:
In [44]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# Dashed, light-grey grid lines on seaborn's dark-grid background.
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'})
%matplotlib inline
figsize(16, 9)
# Bar chart of feature-importance counts, one bar per feature name.
plt.bar(w['id'], w['count'], tick_label=w['name'])
plt.xticks(rotation=45)
Out[44]:
In [20]:
# Save predictions to an SArray
predictions = model.predict(train)
# Evaluate the model and save the results into a dictionary.
# NOTE(review): this evaluates on the full training set, so the numbers
# measure fit, not generalization — use the validation RMSE for model choice.
results = model.evaluate(train)
print results
In [88]:
'''Add feature to week 11'''
def feature_w11(test, lag_sum=0, prior_sum=0):
    '''Build week-11 lag features from the model's own week-10 predictions.

    Predicts demand for the given test rows, then folds the mean predicted
    demand per (Cliente_ID, Producto_ID) back in as next-week lag features
    (pseudo-labeling across weeks).

    Parameters
    ----------
    test : gl.SFrame
        Test rows including 'id' and 'Semana' columns.  Not modified.
    lag_sum : int, optional
        If 1, also add the predicted lag into the 'lag_sum' column.
    prior_sum : int, optional
        If 1, also add the summed predicted demand into 'prior_sum'.

    Returns
    -------
    gl.SFrame
        Copy of `test` with updated lag columns and 'Semana' removed.
    '''
    test_full = test.copy()
    # BUG FIX: the original deleted 'id' and 'Semana' directly from the
    # caller's frame, mutating it as a side effect; predict from a private
    # copy instead so `test` survives intact for later cells.
    pred_input = test.copy()
    ids = pred_input['id']
    del pred_input['id']
    del pred_input['Semana']
    demand_log = model.predict(pred_input)
    sub1 = gl.SFrame({'id': ids, 'Demanda_uni_equil': demand_log})
    test_full = test_full.join(sub1, on=['id'], how='left')
    # Mean predicted demand per (client, product), shifted to the next week.
    lag11 = test_full.groupby(
        key_columns=['Semana', 'Cliente_ID', 'Producto_ID'],
        operations={'lag11': agg.MEAN('Demanda_uni_equil')})
    lag11['Semana'] = lag11['Semana'].apply(lambda x: x + 1)
    test_full = test_full.join(lag11, on=['Semana', 'Cliente_ID', 'Producto_ID'], how='left')
    test_full = test_full.fillna('lag11', 0)
    test_full['lag1'] = test_full['lag1'] + test_full['lag11']
    if lag_sum == 1:
        test_full['lag_sum'] = test_full['lag_sum'] + test_full['lag11']
    if prior_sum == 1:
        # Summed predicted demand, also shifted to the next week.
        lag_sum11 = test_full.groupby(
            key_columns=['Semana', 'Cliente_ID', 'Producto_ID'],
            operations={'lag_sum11': agg.SUM('Demanda_uni_equil')})
        lag_sum11['Semana'] = lag_sum11['Semana'].apply(lambda x: x + 1)
        test_full = test_full.join(lag_sum11, on=['Semana', 'Cliente_ID', 'Producto_ID'], how='left')
        test_full = test_full.fillna('lag_sum11', 0)
        test_full['prior_sum'] = test_full['prior_sum'] + test_full['lag_sum11']
        # BUG FIX: 'lag_sum11' only exists inside this branch; deleting it
        # unconditionally would raise KeyError when prior_sum == 0.
        del test_full['lag_sum11']
    del test_full['lag11']
    del test_full['Demanda_uni_equil']
    del test_full['Semana']
    return test_full
In [87]:
# Week-9 test set with 5 lag features.
test = gl.SFrame.read_csv(path + 'test_lag5_w9.csv', verbose=False)
# BUG FIX: `town` was only defined in a commented-out cell earlier in the
# notebook, so this cell raised NameError on a fresh kernel (Restart & Run
# All).  Load the town features here before joining.
town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
test = test.join(town, on=['Agencia_ID','Producto_ID'], how='left')
# Rows without town info: default counts, then drop the unused columns.
test = test.fillna('t_c',1)
test = test.fillna('tcc',0)
test = test.fillna('tp_sum',0)
del test['Town']
del test['t_c']
del test['n_t']
In [89]:
# del test['prior_sum']
# del test['lag_sum']
del test['week_times']
# del test['Semana']
del test['Canal_ID']
test = test.fillna('lag1',0)
test = test.fillna('lag2',0)
test = test.fillna('lag3',0)
test = test.fillna('lag4',0)
test = test.fillna('lag5',0)
test = test.fillna('lag_sum',0)
test = test.fillna('prior_sum',0)
test = test.fillna('n_a',0)
test = test.fillna('n_r',0)
test = test.fillna('n_c',0)
test = test.fillna('n_p',0)
print test.head()
In [ ]:
test_full = feature_w11(test, lag_sum=1, prior_sum=1)
In [64]:
# Inline variant of feature_w11 above (updates 'lag1' only).
# NOTE(review): this duplicates the function's logic and permanently deletes
# 'id' from `test`, so the cell is not re-runnable on the same frame —
# prefer calling feature_w11 instead.
test_full = test.copy()
ids = test['id']
del test['id']
demand_log = model.predict(test)
sub1 = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})
test_full = test_full.join(sub1,on=['id'],how='left')
# Mean predicted demand per (client, product), shifted forward one week.
lag = test_full.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag':agg.MEAN('Demanda_uni_equil')})
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test_full = test_full.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
test_full = test_full.fillna('lag',0)
test_full['lag1'] = test_full['lag1'] + test_full['lag']
del test_full['lag']
del test_full['Demanda_uni_equil']
In [65]:
In [66]:
# Final prediction pass on the augmented test frame; keep the ids aside
# so they can be paired with the predictions for the submission.
ids = test_full['id']
del test_full['id']
del test_full['Semana']
demand_log = model.predict(test_full)
sub = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})
In [69]:
import math
# Invert the log1p target transform back to raw demand units, clipping
# negative raw predictions to zero first (demand cannot be negative).
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(
    lambda pred: math.expm1(pred if pred > 0 else 0))
In [80]:
# Encode the key hyper-parameters into the submission file name,
# e.g. 'w9_f14_n500_c0.6', so runs are distinguishable on disk.
file_name = 'w9'+'_f'+str(model.num_features)+'_n'+str(model.max_iterations)+'_c'+str(model.column_subsample)
sub.save(path+file_name,format='csv')
In [70]:
sub
Out[70]: