In [1]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
In [2]:
# Data root used for every CSV read, the model checkpoint and the submission
# file. The original cell label translates literally to "steel cannon" --
# presumably the nickname of the training machine; TODO confirm.
# Set BIMBO_DATA_DIR to run on another machine; the default is unchanged.
# Joining with '' guarantees a trailing separator, because later cells build
# file names with plain `path + name` concatenation.
path = os.path.join(os.environ.get('BIMBO_DATA_DIR', '/home/zongyi/bimbo_data/'), '')
In [3]:
# Load the training feature table (file 'train_fs_w9.csv') without parser chatter.
train = gl.SFrame.read_csv(os.path.join(path, 'train_fs_w9.csv'), verbose=False)
In [4]:
# Bring in the 'tcc' column from towns.csv, keyed by agency and product.
# After the left join, training rows with no match in towns.csv get tcc = 0.
# (The fills for 't_c' and 'tp_sum' stay disabled, as in the original cell.)
town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False).select_columns(
    ['Agencia_ID', 'Producto_ID', 'tcc'])
train = train.join(town, on=['Agencia_ID', 'Producto_ID'], how='left').fillna('tcc', 0)
# The lookup table is no longer needed once merged; release it.
del town
In [7]:
# relag_train = gl.SFrame.read_csv(path + 're_lag_train.csv', verbose=False)
# train = train.join(relag_train, on=['Cliente_ID','Producto_ID','Semana'], how='left')
# train = train.fillna('re_lag1',0)
# train = train.fillna('re_lag2',0)
# train = train.fillna('re_lag3',0)
# train = train.fillna('re_lag4',0)
# train = train.fillna('re_lag5',0)
# del relag_train
In [8]:
# pd = gl.SFrame.read_csv(path + 'products.csv', verbose=False)
# train = train.join(pd, on=['Producto_ID'], how='left')
# train = train.fillna('prom',0)
# train = train.fillna('weight',0)
# train = train.fillna('pieces',1)
# train = train.fillna('w_per_piece',0)
# train = train.fillna('healthy',0)
# train = train.fillna('drink',0)
# del train['brand']
# del train['NombreProducto']
# del pd
In [9]:
# client = gl.SFrame.read_csv(path + 'clients.csv', verbose=False)
# train = train.join(client, on=['Cliente_ID'], how='left')
# del client
In [5]:
# Exclude the two aggregate columns from the training features.
# GraphLab's remove_columns edits the SFrame in place and returns it.
train = train.remove_columns(['prior_sum', 'lag_sum'])
# del train['week_times']
# del train['Semana']
In [6]:
# Exclude 'n_t' from the training features as well.
train = train.remove_column('n_t')
In [7]:
# Sanity check: list the columns that will reach the trainer, then their count.
print(train.column_names())
print(len(train.column_names()))
In [8]:
# Make a train-test split
# train_data, test_data = train.random_split(0.999)
# Create a model.
# Gradient-boosted regression trees fitted on the full training frame.
# - The target column name is spelled 'Demada_log' here; presumably that is
#   how the column is named in the CSV -- confirm before "fixing" it.
# - validation_set=None: no hold-out set is passed to the trainer.
# - The checkpoint interval equals max_iterations, so presumably a single
#   checkpoint is written under `path`, at the final iteration.
model = gl.boosted_trees_regression.create(
    dataset=train,
    target='Demada_log',
    validation_set=None,
    metric='rmse',
    max_iterations=1500,
    max_depth=10,
    step_size=0.1,
    row_subsample=0.85,
    column_subsample=0.5,
    random_seed=998,
    model_checkpoint_path=path,
    model_checkpoint_interval=1500,
)
In [18]:
# '''resume_from_checkpoint'''
# train_data, test_data = train.random_split(0.999)
# model = gl.boosted_trees_regression.create(train_data, target='Demada_log',
# step_size=0.1,
# max_iterations=1000,
# max_depth = 10,
# metric='rmse',
# random_seed=461,
# column_subsample=0.75,
# row_subsample=0.85,
# validation_set=test_data,
# resume_from_checkpoint=path+'model_checkpoint_1000_w8',
# model_checkpoint_path=path,
# model_checkpoint_interval=1000)
In [9]:
# Feature importance table from the trained model. add_row_number() prepends
# an 'id' column, which the plotting cell uses as the x position.
w = model.get_feature_importance().add_row_number()
w
Out[9]:
In [32]:
# Bar chart of the feature-importance table `w`:
# x = row number ('id'), bar height = 'count', tick label = feature 'name'.
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# seaborn 'darkgrid' theme with a dashed grid in grey shade '.8'.
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'})
# Render figures inside the notebook. Statement order is kept as-is because
# the style, backend and figsize calls each touch matplotlib's rc settings.
%matplotlib inline
figsize(12, 6)
plt.bar(w['id'], w['count'], tick_label=w['name'])
# Tilt the tick labels by 45 degrees (presumably so feature names do not overlap).
plt.xticks(rotation=45)
Out[32]:
In [23]:
# Scatter plot of training RMSE against boosting iteration, read from the
# model's progress table (columns 'Iteration' and 'Training-rmse').
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# seaborn 'darkgrid' theme with a dashed grid in grey shade '.8'.
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'})
# Render figures inside the notebook. Statement order is kept as-is because
# the style, backend and figsize calls each touch matplotlib's rc settings.
%matplotlib inline
figsize(16, 6)
plt.scatter(model.progress['Iteration'], model.progress['Training-rmse'],alpha=.5)
# Restrict the y axis to the 0.4-0.5 band; points outside it are not shown.
plt.ylim(.4,.5)
# plt.xticks(rotation=45)
Out[23]:
In [16]:
# Save predictions to an SArray
# predictions = model.predict(train)
# Evaluate the model and save the results into a dictionary
# results = model.evaluate(train)
# print results
In [24]:
# Load the test feature table (file 'test_fs_w9.csv') without parser chatter.
# NOTE(review): the training frame gets its 'tcc' column from the towns.csv
# join, while the matching join for the test frame is disabled below --
# presumably test_fs_w9.csv already contains 'tcc'; confirm.
test = gl.SFrame.read_csv(os.path.join(path, 'test_fs_w9.csv'), verbose=False)
# test = test.join(town, on=['Agencia_ID','Producto_ID'], how='left')
# del test['Town']
# test = test.fillna('t_c',1)
# test = test.fillna('tcc',0)
# test = test.fillna('tp_sum',0)
In [26]:
# Strip the test columns that are not passed to the model.
# GraphLab's remove_columns edits the SFrame in place and returns it.
test = test.remove_columns([
    'Canal_ID',
    'lag_sum',
    'prior_sum',
    'n_t',
    'prom',
    'brand',
    'healthy',
    'drink',
])
In [27]:
# Sanity check: list the remaining test columns, then their count.
print(test.column_names())
print(len(test.column_names()))
In [28]:
def feature_w11(test, lag_sum=0, prior_sum=0):
    '''Add feature to week 11.

    Scores every row of ``test`` with the notebook-level ``model``, then
    feeds those predictions forward by one week: the mean prediction of each
    (Semana, Cliente_ID, Producto_ID) group is re-keyed to ``Semana + 1`` and
    added to the ``lag1`` column of the rows that match the shifted key.
    Rows with no match get a contribution of 0.

    Parameters
    ----------
    test : SFrame
        Must contain 'id', 'Semana', 'Cliente_ID', 'Producto_ID', 'lag1' and
        the model's feature columns. It is NOT modified: prediction runs on a
        private copy. (The earlier version deleted 'id' and 'Semana' from the
        caller's frame, so the calling cell could not be re-run.)
    lag_sum : int
        When 1, the shifted mean prediction is also added to 'lag_sum'
        (that column must then be present in ``test``).
    prior_sum : int
        When 1, the shifted SUM of predictions per group is added to
        'prior_sum' (that column must then be present in ``test``).

    Returns
    -------
    SFrame
        A copy of ``test`` (still carrying 'id' and 'Semana') with the
        updated lag columns. Row order is presumably not guaranteed to match
        ``test`` after the joins -- align on 'id'.
    '''
    group_keys = ['Semana', 'Cliente_ID', 'Producto_ID']
    test_full = test.copy()

    # Predict on a separate copy so the caller's frame keeps 'id' and 'Semana'.
    features = test.copy()
    del features['id']
    del features['Semana']
    demand_log = model.predict(features)

    # Attach the raw model output to the full frame by row id.
    sub1 = gl.SFrame({'id':test['id'],'Demanda_uni_equil':demand_log})
    test_full = test_full.join(sub1, on=['id'], how='left')

    # Mean prediction per group, shifted forward one week so that it lines up
    # with the rows of the following week.
    lag11 = test_full.groupby(key_columns=group_keys,
                              operations={'lag11': agg.MEAN('Demanda_uni_equil')})
    lag11['Semana'] = lag11['Semana'].apply(lambda x: x + 1)
    test_full = test_full.join(lag11, on=group_keys, how='left')
    test_full = test_full.fillna('lag11', 0)
    test_full['lag1'] = test_full['lag1'] + test_full['lag11']

    if lag_sum == 1:
        test_full['lag_sum'] = test_full['lag_sum'] + test_full['lag11']

    if prior_sum == 1:
        # Same shift as above, using the per-group sum instead of the mean.
        lag_sum11 = test_full.groupby(key_columns=group_keys,
                                      operations={'lag_sum11': agg.SUM('Demanda_uni_equil')})
        lag_sum11['Semana'] = lag_sum11['Semana'].apply(lambda x: x + 1)
        test_full = test_full.join(lag_sum11, on=group_keys, how='left')
        test_full = test_full.fillna('lag_sum11', 0)
        test_full['prior_sum'] = test_full['prior_sum'] + test_full['lag_sum11']
        # This helper column only exists on this branch.
        del test_full['lag_sum11']

    # Drop the scratch columns so only model features (plus id/Semana) remain.
    del test_full['lag11']
    del test_full['Demanda_uni_equil']
    return test_full
In [29]:
# Refresh the lag feature with first-pass predictions, then score the test set.
test_full = feature_w11(test, lag_sum=0, prior_sum=0)
# Keep the row ids aside: they identify rows in the submission but are not
# passed to the model, and neither is 'Semana'.
ids = test_full['id']
test_full = test_full.remove_columns(['id', 'Semana'])
demand_log = model.predict(test_full)
# Submission frame: one prediction per id (back-transformed in a later cell).
sub = gl.SFrame({'id':ids,'Demanda_uni_equil':demand_log})
In [30]:
import math


def _log_to_demand(x):
    """Clamp a prediction at zero from below, then apply expm1 (exp(x) - 1)."""
    return math.expm1(max(0, x))


# Convert the model output back through expm1; negative predictions become 0.
sub['Demanda_uni_equil'] = sub['Demanda_uni_equil'].apply(_log_to_demand)
In [31]:
# Encode the run settings in the output name:
# w9_f<num_features>_n<max_iterations>_c<column_subsample>.
# No file extension is appended; the content is written as CSV.
file_name = 'w9_f%s_n%s_c%s' % (model.num_features, model.max_iterations, model.column_subsample)
sub.save(path + file_name, format='csv')
In [56]:
# Display the submission frame as the cell output for a visual check.
sub
Out[56]: