In [2]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray
In [2]:
# No-op label string kept from the original cell (roughly "steel cannon");
# presumably a nickname for the machine that holds the data -- TODO confirm.
'''钢炮'''
# Directory prefix used for every CSV path built in the following cells.
path = '/home/zongyi/bimbo_data/'
# Base training table; the file name suggests week-8 rows with 5 lag
# features -- TODO confirm against the file itself.
train_csv = path + 'train_lag5_w8.csv'
train = gl.SFrame.read_csv(train_csv, verbose=False)
In [3]:
# Left-join the columns of towns.csv onto train by agency and product.
town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
train = train.join(town, how='left', on=['Agencia_ID', 'Producto_ID'])
# Fill missing values (e.g. rows with no match in the left join):
# t_c falls back to 1, the other three columns to 0.
for town_col, town_default in (('t_c', 1), ('tcc', 0), ('tp_sum', 0), ('n_t', 0)):
    train = train.fillna(town_col, town_default)
# The 'Town' column is deliberately kept (its delete was commented out).
# del train['Town']
In [4]:
# Remove columns that are not carried into the saved feature file.
for dropped_col in ('id', 'Venta_uni_hoy', 'Venta_hoy',
                    'Dev_uni_proxima', 'Dev_proxima', 'Demanda_uni_equil'):
    del train[dropped_col]
In [5]:
# Left-join re_lag_train.csv (presumably the source of re_lag1..re_lag5)
# onto train by client, product and week.
relag_train = gl.SFrame.read_csv(path + 're_lag_train.csv', verbose=False)
train = train.join(relag_train, how='left', on=['Cliente_ID', 'Producto_ID', 'Semana'])
relag_cols = ['re_lag1', 're_lag2', 're_lag3', 're_lag4', 're_lag5']
# Missing lag values are treated as 0.
for relag_col in relag_cols:
    train = train.fillna(relag_col, 0)
# Despite its name, re_sum is the sum of the five lag columns divided by 5.
# The columns are accumulated left to right, in the same order as before.
relag_total = train[relag_cols[0]]
for relag_col in relag_cols[1:]:
    relag_total = relag_total + train[relag_col]
train['re_sum'] = relag_total / 5
# The lookup table is no longer needed once it has been joined.
del relag_train
In [6]:
# Left-join the columns of products.csv onto train by Producto_ID.
# The table is bound to `products` rather than `pd`: the previous name
# overwrote the pandas alias imported at the top of the notebook, and the
# trailing `del` then removed `pd` from the kernel namespace altogether.
products = gl.SFrame.read_csv(path + 'products.csv', verbose=False)
train = train.join(products, on=['Producto_ID'], how='left')
# Fill missing product attributes: 'pieces' falls back to 1, the rest to 0.
train = train.fillna('prom', 0)
train = train.fillna('weight', 0)
train = train.fillna('pieces', 1)
train = train.fillna('w_per_piece', 0)
train = train.fillna('healthy', 0)
train = train.fillna('drink', 0)
# The lookup table is no longer needed once it has been joined.
del products
In [7]:
# Left-join the columns of clients.csv onto train by Cliente_ID, then
# release the lookup table. No fillna follows this join.
clients_csv = path + 'clients.csv'
client = gl.SFrame.read_csv(clients_csv, verbose=False)
train = train.join(client, how='left', on=['Cliente_ID'])
del client
In [8]:
# Columns removed from train before it is saved. 'tcc' and 'week_times'
# are left in place: their deletes were commented out by the author.
for unused_col in ('Semana', 'Canal_ID',
                   're_lag1', 're_lag2', 're_lag3', 're_lag4', 're_lag5',
                   'prom', 'healthy', 'drink', 'brand'):
    del train[unused_col]
In [9]:
print train.column_names()
print len(train.column_names())
In [10]:
# Write the assembled train table to disk in CSV format.
train_out = path + 'train_fs_w8.csv'
train.save(train_out, format='csv')
In [ ]:
In [32]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'grid.color': '.8','grid.linestyle': u'--'})
%matplotlib inline
figsize(12, 6)
plt.bar(w['id'], w['count'], tick_label=w['name'])
plt.xticks(rotation=45)
Out[32]:
In [ ]:
In [ ]:
In [17]:
# Directory prefix for the CSV paths built in the test-side cells
# (same value as the one assigned in the train section).
path = '/home/zongyi/bimbo_data/'
# Base test table; the file name suggests 5 lag features relative to
# week 9 -- TODO confirm against the file itself.
test_csv = path + 'test_lag5_w9.csv'
test = gl.SFrame.read_csv(test_csv, verbose=False)
In [18]:
# Left-join the columns of towns.csv onto test by agency and product.
town = gl.SFrame.read_csv(path + 'towns.csv', verbose=False)
test = test.join(town, how='left', on=['Agencia_ID', 'Producto_ID'])
# Fill missing values (e.g. rows with no match in the left join):
# t_c falls back to 1, the other three columns to 0.
for town_col, town_default in (('t_c', 1), ('tcc', 0), ('tp_sum', 0), ('n_t', 0)):
    test = test.fillna(town_col, town_default)
# The 'Town' column is deliberately kept (its delete was commented out).
# del test['Town']
In [19]:
# Left-join re_lag_test.csv (presumably the source of re_lag1..re_lag5)
# onto test by client, product and week.
relag_test = gl.SFrame.read_csv(path + 're_lag_test.csv', verbose=False)
test = test.join(relag_test, how='left', on=['Cliente_ID', 'Producto_ID', 'Semana'])
# Missing lag values are treated as 0.
for relag_col in ('re_lag1', 're_lag2', 're_lag3', 're_lag4', 're_lag5'):
    test = test.fillna(relag_col, 0)
def f(x):
    """Return the week-dependent mean of a row's re_lag columns.

    Parameters
    ----------
    x : dict-like
        One row with the keys 'Semana' and 're_lag1' .. 're_lag5'.

    Returns
    -------
    float
        Semana == 10: (re_lag1 + ... + re_lag5) / 5.
        Semana == 11: (re_lag2 + ... + re_lag5) / 4; re_lag1 is left out,
        presumably because it is not available for that week -- TODO confirm.

    Raises
    ------
    ValueError
        If Semana is neither 10 nor 11. Previously this surfaced as an
        UnboundLocalError on the unassigned local `a`.
    """
    week = x['Semana']
    # Float divisors force true division under Python 2 even when the lag
    # values are integers; a bare `/ 5` would silently floor the mean.
    if week == 10:
        return (x['re_lag1'] + x['re_lag2'] + x['re_lag3'] + x['re_lag4'] + x['re_lag5']) / 5.0
    elif week == 11:
        return (x['re_lag2'] + x['re_lag3'] + x['re_lag4'] + x['re_lag5']) / 4.0
    raise ValueError('f: unsupported Semana %r (expected 10 or 11)' % (week,))
# re_sum is computed row by row with f because the set of lag columns that
# is averaged depends on the row's Semana. A plain five-lag average (the
# form used for train) was tried here earlier and replaced by f.
test['re_sum'] = test[['Semana', 're_lag1', 're_lag2', 're_lag3', 're_lag4', 're_lag5']].apply(f)
# The individual lag columns are dropped once re_sum exists.
for lag_col in ('re_lag1', 're_lag2', 're_lag3', 're_lag4', 're_lag5'):
    del test[lag_col]
# Release the joined lookup table, mirroring `del relag_train` in the train
# section; it was previously left alive in the kernel.
del relag_test
In [20]:
# Left-join the columns of products.csv onto test by Producto_ID.
# The table is bound to `products` rather than `pd`: the previous name
# overwrote the pandas alias imported at the top of the notebook, and the
# trailing `del` then removed `pd` from the kernel namespace altogether.
products = gl.SFrame.read_csv(path + 'products.csv', verbose=False)
test = test.join(products, on=['Producto_ID'], how='left')
# Fill missing product attributes: 'pieces' falls back to 1, the rest to 0.
test = test.fillna('prom', 0)
test = test.fillna('weight', 0)
test = test.fillna('pieces', 1)
test = test.fillna('w_per_piece', 0)
test = test.fillna('healthy', 0)
test = test.fillna('drink', 0)
# The lookup table is no longer needed once it has been joined.
del products
In [21]:
# Left-join the columns of clients.csv onto test by Cliente_ID, then
# release the lookup table. No fillna follows this join.
clients_csv = path + 'clients.csv'
client = gl.SFrame.read_csv(clients_csv, verbose=False)
test = test.join(client, how='left', on=['Cliente_ID'])
del client
In [28]:
# Columns removed from test. Unlike the train section, 'Semana' and 'tcc'
# are kept here (their deletes are commented out), and re_lag1..re_lag5
# are not listed because an earlier cell already deletes them.
for unused_col in ('Canal_ID', 'prom', 'healthy', 'drink', 'brand'):
    del test[unused_col]
In [22]:
print test.column_names()
print len(test.column_names())
In [23]:
# Write the assembled test table to disk in CSV format.
test_out = path + 'test_fs_w9.csv'
test.save(test_out, format='csv')
In [ ]:
In [ ]: