In [1]:
import feats
import constants
import transactions
import utils
import os
import pickle
import numpy as np
import pandas as pd
from math import sqrt
from imp import reload
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
In [2]:
with open(constants.FEAT_DATA_DIR + 'up_interval_feat.pkl', 'rb') as f:
    up_interval_feat = pickle.load(f)
In [3]:
# ~7 seconds to run: use all but the last interval as the input sequence (X)
# and the last interval as the regression target (y)
up_interval_feat['X'] = up_interval_feat.p_purchase_interval.apply(lambda x: x[:-1])
up_interval_feat['y'] = up_interval_feat.p_purchase_interval.apply(lambda x: x[-1])
In [4]:
# ~14 seconds to run: left-pad / left-truncate every interval sequence to a fixed length
max_seq_len = 90  # 10 was also used for the smaller model variant below
X = pad_sequences(up_interval_feat.X.values, maxlen=max_seq_len)
y = up_interval_feat.y.values
In [5]:
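# reshape to (samples, time steps, 1 feature) as expected by the LSTM input layer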
X = np.reshape(X, (X.shape[0], X.shape[1], 1))
In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
In [12]:
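# stacked LSTM regressor: two LSTM layers followed by a ReLU dense layer and a
# single linear output, trained with mean-squared-error loss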
def regres_lstm(lstm_hidden_units, num_time_step, num_feat):
    model = Sequential()
    model.add(LSTM(lstm_hidden_units, return_sequences=True,
                   input_shape=(num_time_step, num_feat)))
    # model.add(LSTM(lstm_hidden_units, return_sequences=True,
    #                input_shape=(num_time_step, num_feat)))
    model.add(LSTM(lstm_hidden_units, return_sequences=False))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
In [13]:
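# 128 hidden units per LSTM layer, 90 time steps, 1 feature per step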
# model = regres_lstm(32, 10, 1)
model = regres_lstm(128, 90, 1)
In [14]:
model.summary()
In [15]:
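# checkpoint the weights whenever the training loss improves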
filepath="./__lstm_cache__/" + "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
In [ ]:
model.fit(X_train, y_train, batch_size=256, epochs=10, validation_split=0.2, callbacks=callbacks_list)
In [ ]:
def regres_lstm_pred(model, X, y):
    # compare model predictions against the persistence baseline and the gold targets
    y_pred = model.predict(X).flatten()
    y_persis = X[:, -1].flatten()
    res = pd.DataFrame({'y_pred': y_pred, 'y_persis': y_persis, 'y_gold': y})
    return res
In [ ]:
y_pred = model.predict(X_test)
In [ ]:
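# persistence baseline: predict that the next interval equals the last observed one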
y_baseline = X_test[:, -1]
In [ ]:
res_test = pd.DataFrame({'y_pred':y_pred.flatten(), 'y_baseline':y_baseline.flatten(), 'y_test':y_test})
In [ ]:
res_test.describe()
In [ ]:
rmse_lstm = sqrt(mean_squared_error(res_test.y_test, res_test.y_pred))
In [ ]:
rmse_persis = sqrt(mean_squared_error(res_test.y_test, res_test.y_baseline))
In [ ]:
print("RMSE of Persistence Model is %f \nRMSE of LSTM Model is %f"%(rmse_persis, rmse_lstm))
In [ ]:
model.load_weights('./__lstm_cache__/weights-improvement-05-1514.2298.hdf5')
In [ ]:
res = regres_lstm_pred(model, X_test, y_test)
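In [ ]:
# cal_rmse is not defined anywhere in this notebook; a minimal helper, assumed
# to mirror the sqrt(mean_squared_error(...)) calls used above.
def cal_rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))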
In [ ]:
print("RMSE of Persistence Model is %f \nRMSE of LSTM Model is %f" %
(cal_rmse(res.y_gold, res.y_persis), cal_rmse(res.y_gold, res.y_pred)))
In [ ]:
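# NOTE: maxlen must match the num_time_step the model was built with
# (90 above; 10 corresponds to the commented-out regres_lstm(32, 10, 1) variant)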
max_seq_len = 10
X_all = pad_sequences(up_interval_feat.up_interval.values, maxlen=max_seq_len)
In [ ]:
X_all = np.reshape(X_all, (X_all.shape[0], X_all.shape[1], 1))
In [ ]:
# ~24 minutes to run: score every (user, product) pair with the trained model
y_all = model.predict(X_all)
In [ ]:
with open('./y_all.pkl', 'wb') as f:
    pickle.dump(y_all, f, pickle.HIGHEST_PROTOCOL)
In [ ]:
with open('./y_all.pkl', 'rb') as f:
    y_all = pickle.load(f)
In [ ]:
tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)
In [ ]:
order_products_train = tle.get_orders_items('train')
order_products_prior = tle.get_orders_items('prior')
orders = tle.get_orders()
In [ ]:
users_orders = tle.get_users_orders()
In [ ]:
with open(constants.FEAT_DATA_DIR + 'interact_feat.pkl', 'rb') as f:
    up_interact_feat = pickle.load(f)
In [ ]:
len(up_interact_feat)
In [ ]:
# sanity check: row counts of the two feature tables before merging
print(len(up_interval_feat), len(up_interact_feat))
In [ ]:
up_interval = up_interval_feat[['user_id', 'product_id']].copy()
In [ ]:
up_interval['pred'] = y_all.flatten()
In [ ]:
up_interval = pd.merge(up_interval,
                       up_interact_feat[['user_id', 'product_id', 'up_days_to_last']],
                       on=['user_id', 'product_id'],
                       how='left')
In [ ]:
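# delta: predicted next-purchase interval minus days since the last purchase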
up_interval['delta'] = up_interval['pred'] - up_interval['up_days_to_last']
In [ ]:
up_interval['abs_delta'] = up_interval['delta'].apply(abs)
In [ ]:
up_interval.describe()
In [ ]:
up_interval.columns = ['user_id', 'product_id', 'pred', 'up_days_to_last',
                       'up_delta', 'up_abs_delta']
In [ ]:
up_delta = up_interval[['user_id', 'product_id', 'up_delta', 'up_abs_delta']]
In [ ]:
with open(constants.FEAT_DATA_DIR + 'up_delta.pkl', 'wb') as f:
    pickle.dump(up_delta, f, pickle.HIGHEST_PROTOCOL)
In [ ]:
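# load the (user, product) labels and join them onto the interval features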
with open(constants.FEAT_DATA_DIR + 'label.pkl', 'rb') as f:
    label = pickle.load(f)
In [ ]:
up_interval = pd.merge(up_interval, label, on = ['user_id', 'product_id'], how = 'left')
In [ ]:
up_interval['label'] = up_interval.label.fillna(0.0)
In [ ]:
up_interval.label.describe()
In [ ]:
# use the absolute interval delta ('up_abs_delta' after the rename above) as a single feature
X = up_interval.up_abs_delta.values
X = np.reshape(X, (X.shape[0], 1))
In [ ]:
y = up_interval.label.values
In [ ]: