In [3]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg

In [4]:
# Locate the Bimbo data directory and load the raw tables.
#
# There used to be one load cell per machine, each with its own hard-coded
# absolute path, so a top-to-bottom run always failed on whichever cell pointed
# at the other machine. The known locations are now probed in order and the
# first directory that exists is used. Set the BIMBO_DATA_DIR environment
# variable to use a different location without editing the notebook.
candidate_dirs = [
    os.environ.get('BIMBO_DATA_DIR'),                  # optional override
    '/home/zongyi/bimbo_data/',                        # workstation (/home layout)
    '/Users/zonemercy/jupyter_notebook/bimbo_data/',   # MAC
]
existing_dirs = [d for d in candidate_dirs if d and os.path.isdir(d)]
if not existing_dirs:
    raise IOError('bimbo_data directory not found; tried: %r' % (candidate_dirs,))

# Later cells build file names as `path + name`, so keep a trailing separator.
path = os.path.join(existing_dirs[0], '')
sf = gl.SFrame.read_csv(path + 'train.csv', verbose=False)
town = gl.SFrame.read_csv(path + 'town_state.csv', verbose=False)

In [5]:
# Per-row return rate: returned units divided by (returned units + demand).
# NOTE(review): rows where both columns are 0 have a zero denominator --
# confirm how the resulting value should be treated before relying on the mean.
returned_units = sf['Dev_uni_proxima']
handled_units = returned_units + sf['Demanda_uni_equil']
sf['return_rate'] = returned_units / handled_units

# Average the return rate for every (week, client, product) combination.
# The week shift that turns these means into lag features is applied in the
# following cell, not here.
group_keys = ['Semana', 'Cliente_ID', 'Producto_ID']
re_lag = sf.groupby(
    key_columns=group_keys,
    operations={'re_lag': agg.MEAN('return_rate')},
)

In [6]:
# Build lagged return-rate features.
#
# `lag` keeps the per-(week, client, product) mean return rate; `re_lag` is
# stripped down to the key columns and then receives one extra value column per
# lag. On every pass the week in `lag` is pushed forward by one more week and
# outer-joined back on the keys, so a row for week W picks up the mean observed
# in week W-1, W-2, ... W-N_LAGS. The outer join also keeps key combinations
# that exist on only one side, leaving the missing lag values empty.
#
# The joined value columns come out as 're_lag', 're_lag.1', ... 're_lag.4';
# the rename in the next cell expects exactly those names.
N_LAGS = 5
join_keys = ['Cliente_ID', 'Producto_ID', 'Semana']

lag = re_lag.copy()
re_lag.remove_column('re_lag')

for shift in range(1, N_LAGS + 1):
    # Vectorised increment instead of a per-row Python lambda.
    lag['Semana'] = lag['Semana'] + 1
    re_lag = re_lag.join(lag, on=join_keys, how='outer')

In [13]:
# Give the value columns produced by the successive joins explicit,
# 1-based lag names. The call is the cell's last expression so the
# renamed frame is displayed.
lag_column_names = {
    're_lag': 're_lag1',
    're_lag.1': 're_lag2',
    're_lag.2': 're_lag3',
    're_lag.3': 're_lag4',
    're_lag.4': 're_lag5',
}
re_lag.rename(lag_column_names)


Out[13]:
Cliente_ID Producto_ID Semana re_lag1 re_lag2 re_lag3 re_lag4 re_lag5
18648 2233 5 0.0 0.0 None None None
2323868 48417 4 0.0 None None None None
538561 32819 4 0.0 None None None None
2102008 34054 4 0.142857142857 None None None None
503038 3270 5 0.0 0.0 None None None
1460981 2233 4 0.0 None None None None
4669623 43207 4 0.0 None None None None
181950 30575 4 0.0 None None None None
2497667 42434 5 0.0 None None None None
2209976 41938 4 0.0 None None None None
[217171542 rows x 8 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [14]:
# Training slice: rows whose week is strictly between 5 and 10 (weeks 6-9).
train_weeks_mask = (re_lag['Semana'] > 5) & (re_lag['Semana'] < 10)
re_train = re_lag[train_weeks_mask]

In [15]:
# Write the training-week lag features next to the input data as CSV.
train_output_file = path + 're_lag_train.csv'
re_train.save(train_output_file, format='csv')

In [16]:
# Test slice: rows whose week is strictly between 9 and 12 (weeks 10-11).
test_weeks_mask = (re_lag['Semana'] > 9) & (re_lag['Semana'] < 12)
re_test = re_lag[test_weeks_mask]

In [17]:
# Write the test-week lag features next to the input data as CSV.
test_output_file = path + 're_lag_test.csv'
re_test.save(test_output_file, format='csv')