In [24]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray
In [34]:
'''钢炮'''
path = '/home/zongyi/bimbo_data/'
sf = gl.SFrame.read_csv(path + 'train.csv', verbose=False)
test = gl.SFrame.read_csv(path + 'test.csv', verbose=False)
# town = gl.SFrame.read_csv(path + 'town_state.csv', verbose=False)
In [3]:
# '''MAC'''
# path = '/Users/zonemercy/jupyter_notebook/bimbo_data/'
# sf = gl.SFrame.read_csv(path + 'train.csv', verbose=False)
# town = gl.SFrame.read_csv(path + 'town_state.csv', verbose=False)
In [35]:
sf = sf.add_row_number()
sf['Demada_log'] = sf['Demanda_uni_equil'].apply(lambda x: math.log(x+1))
# train = sf[sf['Semana']>7].copy()
In [ ]:
'''train'''
lag = sf.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag':agg.MEAN('Demada_log')})
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
train.rename({'lag':'lag1','lag.1':'lag2','lag.2':'lag3','lag.3':'lag4','lag.4':'lag5'})
In [4]:
'''test'''
lag = sf.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag':agg.MEAN('Demada_log')})
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
test.rename({'lag':'lag1','lag.1':'lag2','lag.2':'lag3','lag.3':'lag4','lag.4':'lag5'})
Out[4]:
In [5]:
del lag
In [ ]:
train = train.fillna('lag1',0)
train = train.fillna('lag2',0)
train = train.fillna('lag3',0)
train = train.fillna('lag4',0)
train = train.fillna('lag5',0)
train['lag_sum'] = train['lag1'] + train['lag2'] + train['lag3'] + train['lag4'] + train['lag5']
In [ ]:
sum8 = sf[sf['Semana']<8]
sum8 = sum8.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum8['Semana']=8
sum9 = sf[(sf['Semana']<9)&(sf['Semana']>3)]
sum9 = sum9.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum9['Semana']=9
sum89 = sum8.append(sum9)
train = train.join(sum89,on=['Semana','Cliente_ID','Producto_ID'],how='left')
train.fillna('prior_sum',0)
In [ ]:
del sum8
del sum9
del sum89
In [6]:
test = test.fillna('lag1',0)
test = test.fillna('lag2',0)
test = test.fillna('lag3',0)
test = test.fillna('lag4',0)
test = test.fillna('lag5',0)
test['lag_sum'] = test['lag1'] + test['lag2'] + test['lag3'] + test['lag4'] + test['lag5']
In [7]:
# sum8 = sf[sf['Semana']<8]
sum10 = sf.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum10['Semana']=10
# sum9 = sf[sf['Semana']<9]
sum11 = sf.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum11['Semana']=11
sum1011 = sum10.append(sum11)
test = test.join(sum1011,on=['Semana','Cliente_ID','Producto_ID'],how='left')
test = test.fillna('prior_sum',0)
In [8]:
del sum10
del sum11
del sum1011
In [9]:
def count_append(row):
a=1
for i in range(len(row['week_times_list'])):
if row['week_times_list'][i] < row['id']:
a=a+1
return a
In [ ]:
counts = train.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'week_times_list':agg.CONCAT('id')})
train = train.join(counts,on=['Semana','Cliente_ID','Producto_ID'],how='left')
del counts
train['week_times'] = train.apply(count_append)
del train['week_times_list']
In [10]:
counts = test.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'week_times_list':agg.CONCAT('id')})
test = test.join(counts,on=['Semana','Cliente_ID','Producto_ID'],how='left')
del counts
test['week_times'] = test.apply(count_append)
del test['week_times_list']
In [36]:
# test = gl.SFrame.read_csv(path + 'test.csv', verbose=False)
sf = sf[sf['Semana']==9]
del sf['Venta_uni_hoy']
del sf['Venta_hoy']
del sf['Dev_uni_proxima']
del sf['Dev_proxima']
del sf['Demanda_uni_equil']
del sf['Demada_log']
del sf['id']
del test['id']
sf = sf.append(test)
In [37]:
n_a = sf.groupby(key_columns=['Semana','Agencia_ID'], operations={'n_a':agg.COUNT('Agencia_ID')})
n_a = n_a.groupby(key_columns=['Agencia_ID'], operations={'n_a':agg.MEAN('n_a')})
n_r = sf.groupby(key_columns=['Semana','Ruta_SAK'], operations={'n_r':agg.COUNT('Ruta_SAK')})
n_r = n_r.groupby(key_columns=['Ruta_SAK'], operations={'n_r':agg.MEAN('n_r')})
n_c = sf.groupby(key_columns=['Semana','Cliente_ID'], operations={'n_c':agg.COUNT('Cliente_ID')})
n_c = n_c.groupby(key_columns=['Cliente_ID'], operations={'n_c':agg.MEAN('n_c')})
n_p = sf.groupby(key_columns=['Semana','Producto_ID'], operations={'n_p':agg.COUNT('Producto_ID')})
n_p = n_p.groupby(key_columns=['Producto_ID'], operations={'n_p':agg.MEAN('n_p')})
In [9]:
train = gl.SFrame.read_csv(path + 'train_lag5.csv', verbose=False)
del train['n_a']
del train['n_p']
del train['n_c']
del train['n_r']
In [10]:
train = train[train['Semana']==8]
In [11]:
train = train.join(n_a,on=['Agencia_ID'],how='left')
train = train.join(n_r,on=['Ruta_SAK'],how='left')
train = train.join(n_c,on=['Cliente_ID'],how='left')
train = train.join(n_p,on=['Producto_ID'],how='left')
train = train.fillna('n_a',0)
train = train.fillna('n_r',0)
train = train.fillna('n_c',0)
train = train.fillna('n_p',0)
In [38]:
test = gl.SFrame.read_csv(path + 'test_lag5.csv', verbose=False)
del test['n_a']
del test['n_p']
del test['n_c']
del test['n_r']
In [39]:
test = test.join(n_a,on=['Agencia_ID'],how='left')
test = test.join(n_r,on=['Ruta_SAK'],how='left')
test = test.join(n_c,on=['Cliente_ID'],how='left')
test = test.join(n_p,on=['Producto_ID'],how='left')
test = test.fillna('n_a',0)
test = test.fillna('n_r',0)
test = test.fillna('n_c',0)
test = test.fillna('n_p',0)
In [40]:
test
Out[40]:
In [13]:
del n_a
del n_r
del n_c
del n_p
In [45]:
len(test[test['lag_sum']==test['prior_sum']])
Out[45]:
In [12]:
train.save(path+'train_lag5_w8.csv',format='csv')
In [41]:
test.save(path+'test_lag5_w9.csv',format='csv')
In [ ]:
In [4]:
'''prior sum for week8'''
sf = sf[sf['Semana']>4]
sum10 = sf.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum10['Semana']=10
sf = sf[sf['Semana']>5]
sum11 = sf.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum11['Semana']=11
sum1011 = sum10.append(sum11)
In [22]:
test = gl.SFrame.read_csv(path + 'test_w9_fs.csv', verbose=False)
print test
In [23]:
del test['prior_sum']
test = test.join(sum1011,on=['Semana','Cliente_ID','Producto_ID'],how='left')
test = test.fillna('prior_sum',0)
test.save(path+'test_fs.csv',format='csv')
In [ ]:
In [ ]:
In [5]:
test = gl.SFrame.read_csv(path + 'test_lag5_w8.csv', verbose=False)
In [7]:
test = test.fillna('prior_sum.1',0)
In [10]:
test['prior_sum']=test['prior_sum.1']
In [12]:
del test['prior_sum.1']
In [14]:
test.save(path+'test_lag5_w8.csv',format='csv')
In [ ]: