In [1]:
import feats
import constants
import transactions
import utils


import os
import pickle
import numpy as np
import pandas as pd
from math import sqrt
from imp import reload

from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences

from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


Using TensorFlow backend.

Load Dataset


In [2]:
with open(constants.FEAT_DATA_DIR + 'up_interval_feat.pkl', 'rb') as f:
    up_interval_feat = pickle.load(f)

In [3]:
# ~7 seconds
# Use all but the last purchase interval as the input sequence (X)
# and the last interval as the regression target (y),
# e.g. intervals [5, 7, 6, 9] -> X = [5, 7, 6], y = 9.
up_interval_feat['X'] = up_interval_feat.p_purchase_interval.apply(lambda x: x[:-1])
up_interval_feat['y'] = up_interval_feat.p_purchase_interval.apply(lambda x: x[-1])

In [4]:
# ~14 seconds
max_seq_len = 90  # pad/truncate every interval sequence to 90 time steps
X = pad_sequences(up_interval_feat.X.values, maxlen=max_seq_len)
y = up_interval_feat.y.values
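
pad_sequences pre-pads shorter sequences with zeros and pre-truncates longer ones, and its default dtype is int32 (fine here, since the intervals are whole days). A quick toy check with a hypothetical input:

In [ ]:
pad_sequences([[3, 7, 14]], maxlen=5)
# -> array([[ 0,  0,  3,  7, 14]], dtype=int32)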

In [5]:
# reshape to (samples, time steps, features), as expected by Keras LSTM layers
X = np.reshape(X, (X.shape[0], X.shape[1], 1))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Regression LSTM


In [12]:
def regres_lstm(lstm_hidden_units, num_time_step, num_feat):
    model = Sequential()
    # First LSTM layer returns the full sequence so it can feed a second LSTM.
    model.add(LSTM(lstm_hidden_units, return_sequences=True,
                   input_shape=(num_time_step, num_feat)))
    # Second LSTM layer returns only its final hidden state.
    model.add(LSTM(lstm_hidden_units, return_sequences=False))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [13]:
# model = regres_lstm(32, 10, 1)
model = regres_lstm(128, 90, 1)

In [14]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_3 (LSTM)                (None, 90, 128)           66560     
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
=================================================================
Total params: 202,305
Trainable params: 202,305
Non-trainable params: 0
_________________________________________________________________
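
As a sanity check on the summary above, an LSTM layer has 4 * ((input_dim + units) * units + units) parameters (four gates, each with input weights, recurrent weights, and a bias), which reproduces the counts:

In [ ]:
def lstm_params(input_dim, units):
    return 4 * ((input_dim + units) * units + units)

print(lstm_params(1, 128))    # 66560  (lstm_3)
print(lstm_params(128, 128))  # 131584 (lstm_4)
print(128 * 32 + 32)          # 4128   (dense_3)
print(32 * 1 + 1)             # 33     (dense_4)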

In [15]:
# make sure the cache directory exists before ModelCheckpoint tries to save into it
os.makedirs('./__lstm_cache__', exist_ok=True)
filepath = "./__lstm_cache__/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [ ]:
model.fit(X_train, y_train, batch_size=256, epochs=10, validation_split=0.2, callbacks=callbacks_list)


Train on 1981178 samples, validate on 495295 samples
Epoch 1/10
1980928/1981178 [============================>.] - ETA: 0s - loss: 1638.9518Epoch 00000: loss improved from inf to 1638.99821, saving model to ./__lstm_cache__/weights-improvement-00-1638.9982.hdf5
1981178/1981178 [==============================] - 2217s - loss: 1638.9982 - val_loss: 1640.9863
Epoch 2/10
1980928/1981178 [============================>.] - ETA: 0s - loss: 1635.8535Epoch 00001: loss improved from 1638.99821 to 1635.83705, saving model to ./__lstm_cache__/weights-improvement-01-1635.8371.hdf5
1981178/1981178 [==============================] - 2117s - loss: 1635.8371 - val_loss: 1640.9780
Epoch 3/10
1980928/1981178 [============================>.] - ETA: 0s - loss: 1635.7965Epoch 00002: loss improved from 1635.83705 to 1635.83570, saving model to ./__lstm_cache__/weights-improvement-02-1635.8357.hdf5
1981178/1981178 [==============================] - 2182s - loss: 1635.8357 - val_loss: 1640.9985
Epoch 4/10
 182784/1981178 [=>............................] - ETA: 2168s - loss: 1636.5614

In [ ]:
def regres_lstm_pred(model, X, y):
    # Collect the LSTM prediction, the persistence baseline (repeat the
    # last observed interval), and the gold labels in one DataFrame.
    y_pred = model.predict(X).flatten()
    y_persis = X[:, -1].flatten()
    res = pd.DataFrame({'y_pred': y_pred, 'y_persis': y_persis, 'y_gold': y})
    return res

In [ ]:
y_pred = model.predict(X_test)

In [ ]:
# persistence baseline: predict the last observed interval
y_baseline = X_test[:, -1]

In [ ]:
res_test = pd.DataFrame({'y_pred':y_pred.flatten(), 'y_baseline':y_baseline.flatten(), 'y_test':y_test})

In [ ]:
res_test.describe()

In [ ]:
rmse_lstm = sqrt(mean_squared_error(res_test.y_test, res_test.y_pred))

In [ ]:
rmse_persis = sqrt(mean_squared_error(res_test.y_test, res_test.y_baseline))

In [ ]:
print("RMSE of Persistence Model is %f \nRMSE of LSTM Model is %f"%(rmse_persis, rmse_lstm))

Check other models


In [ ]:
model.load_weights('./__lstm_cache__/weights-improvement-05-1514.2298.hdf5')

In [ ]:
res = regres_lstm_pred(model, X_test, y_test)

In [ ]:
print("RMSE of Persistence Model is %f \nRMSE of LSTM Model is %f" %
      (sqrt(mean_squared_error(res.y_gold, res.y_persis)),
       sqrt(mean_squared_error(res.y_gold, res.y_pred))))

Predict all orders


In [ ]:
# the trained model expects sequences of length 90, so keep maxlen consistent;
# feed the full purchase-interval history to predict each pair's next interval
max_seq_len = 90
X_all = pad_sequences(up_interval_feat.p_purchase_interval.values, maxlen=max_seq_len)

In [ ]:
X_all = np.reshape(X_all, (X_all.shape[0], X_all.shape[1], 1))

In [ ]:
#24 minutes
y_all = model.predict(X_all)

In [ ]:
with open('./y_all.pkl', 'wb') as f:
    pickle.dump(y_all, f, pickle.HIGHEST_PROTOCOL)

In [ ]:
with open('./y_all.pkl', 'rb') as f:
    y_all = pickle.load(f)

Validate with train orders

  • y_all covers (u, p) pairs that appear at least twice in the prior orders, i.e. the user bought the product at least twice; products purchased only once are filtered out
  • How should products purchased only once in the prior orders be predicted? (a fallback sketch follows below)
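
For single-purchase products there is no interval history to feed the LSTM, so one simple fallback is to predict a typical interval instead. A minimal sketch, assuming a global median is an acceptable stand-in (this fallback is not part of the original notebook):

In [ ]:
# hypothetical fallback for (user, product) pairs with only one prior purchase:
# predict the median next-interval observed across all multi-purchase pairs
fallback_interval = up_interval_feat.y.median()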

In [ ]:
tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)

In [ ]:
order_products_train = tle.get_orders_items('train')

order_products_prior = tle.get_orders_items('prior')

orders = tle.get_orders()

In [ ]:
users_orders = tle.get_users_orders()

In [ ]:
with open(constants.FEAT_DATA_DIR + 'interact_feat.pkl', 'rb') as f:
    up_interact_feat = pickle.load(f)

In [ ]:
len(up_interact_feat)

In [ ]:
print(len(up_interval_feat), len(up_interact_feat))

In [ ]:
up_interval = up_interval_feat[['user_id', 'product_id']].copy()

In [ ]:
up_interval['pred'] = y_all.flatten()

In [ ]:
up_interval = pd.merge(up_interval, 
                       up_interact_feat[['user_id', 'product_id', 'up_days_to_last']], 
                       on = ['user_id', 'product_id'], 
                       how = 'left')

In [ ]:
up_interval['delta'] = up_interval['pred'] - up_interval['up_days_to_last']

In [ ]:
up_interval['abs_delta'] = up_interval['delta'].abs()

In [ ]:
up_interval.describe()

In [ ]:
up_interval = up_interval.rename(columns={'delta': 'up_delta', 'abs_delta': 'up_abs_delta'})

In [ ]:
up_delta = up_interval[['user_id', 'product_id', 'up_delta', 'up_abs_delta']]

In [ ]:
with open(constants.FEAT_DATA_DIR + 'up_delta.pkl', 'wb') as f:
    pickle.dump(up_delta, f, pickle.HIGHEST_PROTOCOL)

In [ ]:
with open(constants.FEAT_DATA_DIR + 'label.pkl', 'rb') as f:
    label = pickle.load(f)

In [ ]:
up_interval = pd.merge(up_interval, label, on = ['user_id', 'product_id'], how = 'left')

In [ ]:
up_interval['label'] = up_interval.label.fillna(0.0)

In [ ]:
up_interval.label.describe()

In [ ]:
# single feature: absolute gap between the predicted interval and days since last purchase
X = up_interval.up_abs_delta.values
X = np.reshape(X, (X.shape[0], 1))

In [ ]:
y = up_interval.label.values

In [ ]:
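# The imports at the top include sklearn.svm, and X / y above hold the
# single abs-delta feature and the reorder label, so a classifier on that
# feature is presumably the next step. A minimal hedged sketch (the model
# choice and parameters are assumptions, not from the original notebook):
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
clf = svm.LinearSVC()  # a linear SVM scales better to ~2M rows than a kernel SVC
clf.fit(X_tr, y_tr)
print(clf.score(X_te, y_te))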