In [24]:
import pandas as pd
import numpy as np
import os
import math
import graphlab
import graphlab as gl
import graphlab.aggregate as agg
from graphlab import SArray

In [34]:
'''钢炮'''
path = '/home/zongyi/bimbo_data/'
sf = gl.SFrame.read_csv(path + 'train.csv', verbose=False)
test = gl.SFrame.read_csv(path + 'test.csv', verbose=False)
# town = gl.SFrame.read_csv(path + 'town_state.csv', verbose=False)

In [3]:
# '''MAC'''
# path = '/Users/zonemercy/jupyter_notebook/bimbo_data/'
# sf = gl.SFrame.read_csv(path + 'train.csv', verbose=False)
# town = gl.SFrame.read_csv(path + 'town_state.csv', verbose=False)

In [35]:
sf = sf.add_row_number()
sf['Demada_log'] = sf['Demanda_uni_equil'].apply(lambda x: math.log(x+1))
# train = sf[sf['Semana']>7].copy()

lag5


In [ ]:
'''train'''
lag = sf.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag':agg.MEAN('Demada_log')})

lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
train = train.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')

train.rename({'lag':'lag1','lag.1':'lag2','lag.2':'lag3','lag.3':'lag4','lag.4':'lag5'})

In [4]:
'''test'''
lag = sf.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'lag':agg.MEAN('Demada_log')})

lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')
lag['Semana'] = lag['Semana'].apply(lambda x: x+1)
test = test.join(lag,on=['Semana','Cliente_ID','Producto_ID'],how='left')

test.rename({'lag':'lag1','lag.1':'lag2','lag.2':'lag3','lag.3':'lag4','lag.4':'lag5'})


Out[4]:
id Semana Agencia_ID Canal_ID Ruta_SAK Cliente_ID Producto_ID lag1 lag2
2824773 10 1138 1 1433 17206 32819 1.09861228867 1.38629436112
2154416 10 1693 11 3921 4527069 30531 5.20400668708 4.94875989038
545145 10 1347 1 2057 2493819 43069 1.79175946923 0.69314718056
2689716 10 2060 1 1204 1209040 2425 2.56494935746 2.83321334406
1910575 10 4040 1 2168 1092506 30569 1.60943791243 1.60943791243
4031765 10 1130 1 1406 230252 1284 2.99573227355 3.25809653802
4991054 11 1315 1 1273 4528250 1146 None 1.60943791243
6657368 10 2055 1 1157 4363818 972 1.09861228867 1.38629436112
211220 10 2653 1 1140 4397803 35651 2.07944154168 1.60943791243
4101441 10 1339 4 6602 4619018 1145 3.25809653802 3.25809653802
lag3 lag4 lag5
1.60943791243 1.38629436112 1.09861228867
4.04305126783 5.01727983681 1.94591014906
None None 1.38629436112
2.83321334406 2.63905732962 2.99573227355
1.60943791243 0.69314718056 1.94591014906
2.3978952728 1.94591014906 3.25809653802
0.69314718056 1.09861228867 1.38629436112
0.69314718056 0.69314718056 1.09861228867
1.60943791243 1.94591014906 1.09861228867
3.36729582999 2.30258509299 3.21887582487
[6999251 rows x 12 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [5]:
del lag

lag_sum and prior_sum


In [ ]:
train = train.fillna('lag1',0)
train = train.fillna('lag2',0)
train = train.fillna('lag3',0)
train = train.fillna('lag4',0)
train = train.fillna('lag5',0)
train['lag_sum'] = train['lag1'] + train['lag2'] + train['lag3'] + train['lag4'] + train['lag5']

In [ ]:
sum8 = sf[sf['Semana']<8]
sum8 = sum8.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum8['Semana']=8
sum9 = sf[(sf['Semana']<9)&(sf['Semana']>3)]
sum9 = sum9.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum9['Semana']=9
sum89 = sum8.append(sum9)
train = train.join(sum89,on=['Semana','Cliente_ID','Producto_ID'],how='left')
train.fillna('prior_sum',0)

In [ ]:
del sum8
del sum9
del sum89

In [6]:
test = test.fillna('lag1',0)
test = test.fillna('lag2',0)
test = test.fillna('lag3',0)
test = test.fillna('lag4',0)
test = test.fillna('lag5',0)
test['lag_sum'] = test['lag1'] + test['lag2'] + test['lag3'] + test['lag4'] + test['lag5']

In [7]:
# sum8 = sf[sf['Semana']<8]
sum10 = sf.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum10['Semana']=10
# sum9 = sf[sf['Semana']<9]
sum11 = sf.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum11['Semana']=11
sum1011 = sum10.append(sum11)
test = test.join(sum1011,on=['Semana','Cliente_ID','Producto_ID'],how='left')
test = test.fillna('prior_sum',0)

In [8]:
del sum10
del sum11
del sum1011

week times


In [9]:
def count_append(row):
    a=1
    for i in range(len(row['week_times_list'])):
        if row['week_times_list'][i] < row['id']:
            a=a+1
    return a

In [ ]:
counts = train.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'week_times_list':agg.CONCAT('id')})
train = train.join(counts,on=['Semana','Cliente_ID','Producto_ID'],how='left')
del counts
train['week_times'] = train.apply(count_append)
del train['week_times_list']

In [10]:
counts = test.groupby(key_columns=['Semana','Cliente_ID','Producto_ID'], operations={'week_times_list':agg.CONCAT('id')})
test = test.join(counts,on=['Semana','Cliente_ID','Producto_ID'],how='left')
del counts
test['week_times'] = test.apply(count_append)
del test['week_times_list']

mean number of agencia ruta client product by week


In [36]:
# test = gl.SFrame.read_csv(path + 'test.csv', verbose=False)
sf = sf[sf['Semana']==9]
del sf['Venta_uni_hoy']
del sf['Venta_hoy']
del sf['Dev_uni_proxima']
del sf['Dev_proxima']
del sf['Demanda_uni_equil']
del sf['Demada_log']
del sf['id']
del test['id']
sf = sf.append(test)

In [37]:
n_a = sf.groupby(key_columns=['Semana','Agencia_ID'], operations={'n_a':agg.COUNT('Agencia_ID')})
n_a = n_a.groupby(key_columns=['Agencia_ID'], operations={'n_a':agg.MEAN('n_a')})
n_r = sf.groupby(key_columns=['Semana','Ruta_SAK'], operations={'n_r':agg.COUNT('Ruta_SAK')})
n_r = n_r.groupby(key_columns=['Ruta_SAK'], operations={'n_r':agg.MEAN('n_r')})
n_c = sf.groupby(key_columns=['Semana','Cliente_ID'], operations={'n_c':agg.COUNT('Cliente_ID')})
n_c = n_c.groupby(key_columns=['Cliente_ID'], operations={'n_c':agg.MEAN('n_c')})
n_p = sf.groupby(key_columns=['Semana','Producto_ID'], operations={'n_p':agg.COUNT('Producto_ID')})
n_p = n_p.groupby(key_columns=['Producto_ID'], operations={'n_p':agg.MEAN('n_p')})

In [9]:
train = gl.SFrame.read_csv(path + 'train_lag5.csv', verbose=False)
del train['n_a']
del train['n_p']
del train['n_c']
del train['n_r']

In [10]:
train = train[train['Semana']==8]

In [11]:
train = train.join(n_a,on=['Agencia_ID'],how='left')
train = train.join(n_r,on=['Ruta_SAK'],how='left')
train = train.join(n_c,on=['Cliente_ID'],how='left')
train = train.join(n_p,on=['Producto_ID'],how='left')
train = train.fillna('n_a',0)
train = train.fillna('n_r',0)
train = train.fillna('n_c',0)
train = train.fillna('n_p',0)

In [38]:
test = gl.SFrame.read_csv(path + 'test_lag5.csv', verbose=False)
del test['n_a']
del test['n_p']
del test['n_c']
del test['n_r']

In [39]:
test = test.join(n_a,on=['Agencia_ID'],how='left')
test = test.join(n_r,on=['Ruta_SAK'],how='left')
test = test.join(n_c,on=['Cliente_ID'],how='left')
test = test.join(n_p,on=['Producto_ID'],how='left')
test = test.fillna('n_a',0)
test = test.fillna('n_r',0)
test = test.fillna('n_c',0)
test = test.fillna('n_p',0)

In [40]:
test


Out[40]:
id Semana Agencia_ID Canal_ID Ruta_SAK Cliente_ID Producto_ID lag1 lag2 lag3
3462602 10 1366 1 1051 1368806 1146 1.09861 0.693147 0.693147
4115126 10 1462 1 1114 1892625 43202 1.60944 0.693147 1.09861
2268424 10 1339 1 1208 134395 3270 1.60944 1.38629 1.38629
1147412 10 1423 1 2112 1218112 31507 1.79176 1.79176 0.0
5404948 10 1339 1 1106 550459 1232 0.693147 0.693147 0.0
1792840 10 22187 1 1254 1316091 1146 0.693147 0.693147 0.693147
18723 10 1347 1 2076 2395834 43316 0.0 0.0 0.0
1073043 10 2034 1 1145 253861 1242 0.693147 1.60944 1.38629
470736 10 1222 1 2151 925514 35305 2.77259 2.77259 2.3979
1787383 10 2036 1 1028 1741600 1146 1.94591 1.79176 2.3979
lag4 lag5 lag_sum prior_sum week_times n_a n_r n_c n_p
1.38629 0.693147 4.56435 4.56435 1 10191.6666667 12811.0 7.66666666667 117552.666667
0.693147 1.38629 5.48064 5.48064 1 17206.3333333 16272.3333333 15.6666666667 26207.6666667
1.38629 1.38629 7.15462 7.15462 1 17208.0 27437.6666667 16.0 36871.6666667
0.0 1.79176 5.37528 5.37528 1 24501.6666667 14717.3333333 21.6666666667 42043.6666667
0.0 0.0 1.38629 1.38629 1 17208.0 20512.6666667 12.0 107057.0
1.09861 0.693147 3.8712 3.8712 1 23007.3333333 18767.6666667 6.0 117552.666667
1.09861 0.0 1.09861 1.09861 1 45482.6666667 336.666666667 5.33333333333 43675.0
0.0 0.0 3.68888 3.68888 1 43120.6666667 3764.66666667 10.3333333333 146793.666667
0.0 0.0 7.94307 7.94307 1 39282.3333333 11854.6666667 11.3333333333 119140.333333
1.09861 2.19722 9.4314 9.4314 1 16680.0 7637.66666667 10.6666666667 117552.666667
[6999251 rows x 19 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [13]:
del n_a
del n_r
del n_c
del n_p

In [45]:
len(test[test['lag_sum']==test['prior_sum']])


Out[45]:
6980947

SAVE


In [12]:
train.save(path+'train_lag5_w8.csv',format='csv')

In [41]:
test.save(path+'test_lag5_w9.csv',format='csv')

In [ ]:


In [4]:
'''prior sum for week8'''
sf = sf[sf['Semana']>4]
sum10 = sf.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum10['Semana']=10
sf = sf[sf['Semana']>5]
sum11 = sf.groupby(key_columns=['Cliente_ID','Producto_ID'], operations={'prior_sum':agg.SUM('Demada_log')})
sum11['Semana']=11
sum1011 = sum10.append(sum11)

In [22]:
test = gl.SFrame.read_csv(path + 'test_w9_fs.csv', verbose=False)
print test


+---------+--------+------------+----------+------------+-------------+----------+
|    id   | Semana | Agencia_ID | Ruta_SAK | Cliente_ID | Producto_ID |   lag1   |
+---------+--------+------------+----------+------------+-------------+----------+
| 4600038 |   10   |    2057    |   2160   |  7813953   |    31507    | 1.94591  |
| 5502478 |   10   |    1344    |   1206   |   917733   |    43201    | 0.693147 |
| 5099473 |   10   |   24539    |   1003   |  1815419   |    34204    | 1.09861  |
| 3792782 |   10   |    4041    |   1157   |   64384    |     1240    | 1.09861  |
| 1668210 |   10   |    2024    |   1106   |   300511   |     1216    | 0.693147 |
| 6411079 |   10   |    1419    |   1093   |  9514906   |    34292    | 1.60944  |
| 6162760 |   10   |    1221    |   4427   |  2201146   |    43285    | 1.94591  |
| 6046475 |   10   |    2032    |   1067   |   243264   |      73     | 1.09861  |
| 5338424 |   10   |    1387    |   1104   |   411722   |    43202    | 1.94591  |
|  869049 |   10   |    1111    |   1021   |  2349920   |     2665    | 1.38629  |
+---------+--------+------------+----------+------------+-------------+----------+
+----------+----------+----------+----------+---------+-----------+------------+
|   lag2   |   lag3   |   lag4   |   lag5   | lag_sum | prior_sum | week_times |
+----------+----------+----------+----------+---------+-----------+------------+
| 1.94591  | 1.94591  |   0.0    |   0.0    | 5.83773 |  5.83773  |     1      |
| 0.693147 |   0.0    |   0.0    |   0.0    | 1.38629 |  1.38629  |     1      |
| 2.70805  | 2.70805  | 1.60944  | 3.09104  | 11.2152 |  16.5623  |     1      |
| 1.60944  | 1.09861  | 0.693147 | 1.38629  |  5.8861 |  8.88184  |     1      |
|   0.0    |   0.0    |   0.0    | 1.09861  | 1.79176 |  1.79176  |     1      |
|   0.0    | 1.38629  | 1.38629  | 1.09861  | 5.48064 |  5.48064  |     1      |
|   0.0    |   0.0    |   0.0    |  3.2581  | 5.20401 |  7.91206  |     1      |
| 0.693147 | 0.693147 | 1.09861  | 0.693147 | 4.27667 |  4.27667  |     1      |
|   0.0    |   0.0    |   0.0    |   0.0    | 1.94591 |  1.94591  |     1      |
| 2.07944  | 1.60944  | 1.09861  | 1.79176  | 7.96555 |  11.0101  |     1      |
+----------+----------+----------+----------+---------+-----------+------------+
+---------+---------+---------+----------+------+-----+-----+
|   n_a   |   n_r   |   n_c   |   n_p    | Town | t_c | ... |
+---------+---------+---------+----------+------+-----+-----+
| 40847.3 |  4680.0 | 3.33333 | 42043.7  |  32  |  3  | ... |
| 25304.3 | 29226.3 |   6.0   | 15866.7  | 131  |  3  | ... |
|  7047.0 | 19827.7 | 17.6667 | 2021.67  | 197  |  1  | ... |
| 22477.7 | 12876.3 |   12.0  | 142714.0 | 153  |  4  | ... |
| 13246.7 | 20512.7 |   11.0  | 71232.0  |  72  |  2  | ... |
| 39039.0 |  457.0  | 28.6667 | 5439.67  | 206  |  3  | ... |
| 16089.7 | 1392.33 | 14.3333 | 98334.7  |  8   |  8  | ... |
| 32083.0 | 5770.33 |   24.0  | 17564.7  |  57  |  5  | ... |
| 21309.7 | 25222.7 | 16.3333 | 26207.7  | 106  |  5  | ... |
| 34838.7 | 12660.3 | 30.3333 | 25259.0  |  1   |  4  | ... |
+---------+---------+---------+----------+------+-----+-----+
[6999251 rows x 30 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [23]:
del test['prior_sum']
test = test.join(sum1011,on=['Semana','Cliente_ID','Producto_ID'],how='left')
test = test.fillna('prior_sum',0)
test.save(path+'test_fs.csv',format='csv')

In [ ]:


In [ ]:


In [5]:
test = gl.SFrame.read_csv(path + 'test_lag5_w8.csv', verbose=False)

In [7]:
test = test.fillna('prior_sum.1',0)

In [10]:
test['prior_sum']=test['prior_sum.1']

In [12]:
del test['prior_sum.1']

In [14]:
test.save(path+'test_lag5_w8.csv',format='csv')

In [ ]: