In [6]:
import feats
import utils
import constants
import transactions
import os
import pickle
import operator
import numpy as np
import pandas as pd
import seaborn as sns
from importlib import reload
from matplotlib import pyplot as plt
from statsmodels.tsa.api import VAR
from scipy.spatial.distance import euclidean
from sklearn.utils.extmath import cartesian
from sklearn.feature_extraction.text import CountVectorizer
from pandas.plotting import lag_plot, autocorrelation_plot
In [3]:
uo = tle.get_users_orders('prior')
In [4]:
up_pair = uo[['user_id', 'product_id']].drop_duplicates()
In [ ]:
order_products_train = tle.get_orders_items('train')
In [33]:
order_products_prior = tle.get_orders_items('prior')
orders = tle.get_orders()
products = tle.get_items('products')
aisles = tle.get_items('aisles')
departments = tle.get_items('departments')
In [12]:
products_details = pd.merge(products, tle.craft_feat_product(), on = ['product_id'], how = 'right')
In [6]:
order_is_None = order_products_train.groupby(['order_id'])['reordered'].sum().reset_index()
In [11]:
len(order_is_None[order_is_None.reordered == 0]) / len(order_is_None[order_is_None.reordered > 0])
Out[11]:
In [19]:
a = pd.merge(order_is_None, orders, how = 'left', on = ['order_id'])
In [65]:
order_products_all = pd.concat([order_products_prior, order_products_train], axis = 0)
In [21]:
grouped = order_products_prior.groupby("order_id")["add_to_cart_order"].aggregate("max").reset_index()
In [22]:
grouped.add_to_cart_order.describe()
Out[22]:
In [23]:
grouped = pd.merge(grouped,
orders,
on = ['order_id'],
how = 'left')[['user_id', 'add_to_cart_order', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']]
In [24]:
grouped = grouped.sort_values(['user_id', 'order_number'])
In [25]:
grouped.columns = ['user_id',
'num_products',
'order_number',
'order_dow',
'order_hour_of_day',
'days_since_prior_order']
In [18]:
user_num_product = grouped.groupby(['user_id'])['num_products'].agg(['mean', 'std'])
In [ ]:
with open(constants.FEAT_DATA_DIR + 'user_num_product_stat.pkl', 'wb') as f:
    pickle.dump(user_num_product, f, pickle.HIGHEST_PROTOCOL)
In [3]:
with open(constants.FEAT_DATA_DIR + 'user_num_product_stat.pkl', 'rb') as f:
    user_num_product = pickle.load(f)
In [7]:
user_num_product['std'].describe()
Out[7]:
In [26]:
grouped = order_products_all.groupby("product_id")["reordered"].aggregate(reorder_sum='sum', reorder_total='count').reset_index()
grouped['reorder_probability'] = grouped['reorder_sum'] / grouped['reorder_total']
grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])
grouped = grouped[grouped.reorder_total > 75].sort_values(['reorder_probability'], ascending=False)[:10]
In [27]:
prior_reorder_rate = order_products_prior.groupby(['order_id'])['reordered'] \
    .aggregate(reorder_pnum='sum', pnum='count')
In [28]:
prior_reorder_rate['reorder_rate'] = prior_reorder_rate['reorder_pnum'] / prior_reorder_rate['pnum']
In [29]:
prior_reorder_rate.reset_index(inplace=True)
In [30]:
prior_orders = orders[orders.eval_set == 'prior']
In [31]:
prior_orders = pd.merge(prior_orders, prior_reorder_rate,
on = ['order_id'], how = 'left')
In [32]:
prior_orders.head(5)
Out[32]:
In [33]:
user_reorder_est = prior_orders.groupby(['user_id'])['reorder_pnum'] \
    .aggregate(reorder_pnum_mean='mean', reorder_pnum_std='std').reset_index()
In [34]:
user_reorder_est = user_reorder_est[['user_id', 'reorder_pnum_mean', 'reorder_pnum_std']]
In [35]:
with open(constants.FEAT_DATA_DIR + 'user_reorder_est.pkl', 'wb') as f:
    pickle.dump(user_reorder_est, f, pickle.HIGHEST_PROTOCOL)
In [3]:
with open(constants.FEAT_DATA_DIR + 'user_reorder_est.pkl', 'rb') as f:
    user_reorder_est = pickle.load(f)
In [10]:
user_reorder_est.reorder_pnum_std.describe()
Out[10]:
In [ ]:
users_products = pd.merge(prior_orders, order_products_prior, on = ['order_id'], how = 'left')
In [20]:
users_products = users_products.groupby(['user_id'])['product_id'].apply(list).reset_index()
In [22]:
with open(constants.FEAT_DATA_DIR + 'user_product.pkl', 'wb') as f:
    pickle.dump(users_products, f, pickle.HIGHEST_PROTOCOL)
In [3]:
with open(constants.FEAT_DATA_DIR + 'user_product.pkl', 'rb') as f:
    users_products = pickle.load(f)
In [6]:
l = users_products.product_id.apply(len)
In [10]:
l.describe()
Out[10]:
In [394]:
grouped = order_products_all.groupby("product_id")["reordered"].aggregate(reorder_sum='sum', reorder_total='count').reset_index()
grouped['reorder_probability'] = grouped['reorder_sum'] / grouped['reorder_total']
In [13]:
grouped = orders.order_hour_of_day.value_counts()
sns.set_style('darkgrid')
sns.barplot(x=grouped.index, y=grouped.values)
plt.show()
In [15]:
# construct the user x product term-frequency matrix
orders = pd.read_csv(constants.RAW_DATA_DIR + 'orders.csv')
users_orders = pd.merge(order_products_prior, orders[['user_id', 'order_id']],
on = ['order_id'], how = 'left')
users_products_matrix = users_orders.groupby(['user_id'])['product_id'].apply(series_to_str)
tf = CountVectorizer(analyzer = 'word', lowercase = False, max_df=0.95, min_df=2,)
tf_matrix = tf.fit_transform(users_products_matrix.values)
tf_feature_names = tf.get_feature_names()
with open(constants.FEAT_DATA_DIR + 'tf.model', 'wb') as f:
    pickle.dump(tf, f, pickle.HIGHEST_PROTOCOL)
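`series_to_str` is not defined in this excerpt (presumably a helper in utils); a minimal sketch under that assumption, joining a user's product ids into one space-separated "document" for CountVectorizer:
In [ ]:
def series_to_str(s, sep=' '):
    '''Join a Series of product ids into a space-separated string so each group becomes one document.'''
    return sep.join(str(x) for x in s)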
In [56]:
# topics per order: tf is the CountVectorizer that maps each document to its term-frequency vector
op = order_products_prior.groupby(['order_id'])['product_id'].apply(series_to_str)
topic_order = pd.DataFrame(lda.transform(tf.transform(op.values)), columns= ["topic_%d"%x for x in range(10)])
topic_order['order_id'] = op.index.values
with open(constants.FEAT_DATA_DIR + 'order_topic_norm.pkl', 'wb') as f:
    pickle.dump(topic_order, f, pickle.HIGHEST_PROTOCOL)  # lda.transform output is already row-normalized per order
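`lda` is not fitted in this excerpt; a minimal sketch (an assumption, not the original training cell) of fitting a 10-topic LDA on the term-frequency matrix built above and deriving the `topic_product` frame used below:
In [ ]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10, learning_method='batch', random_state=42)
lda.fit(tf_matrix)  # tf_matrix: user x product term-frequency matrix from the previous cell

# hypothetical derivation of topic_product: per-product topic weights, normalized per product
topic_product = pd.DataFrame(lda.components_.T / lda.components_.T.sum(axis=1)[:, np.newaxis],
                             columns=["topic_%d" % x for x in range(10)])
topic_product['product_id'] = [int(p) for p in tf.get_feature_names()]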
In [65]:
up_distance = pd.merge(users_orders[['user_id', 'product_id']].drop_duplicates(),
user_topic,
on = ['user_id'],
how = 'left')
up_distance.columns = ['user_id', 'product_id'] + ["u_topic_%d"%x for x in range(10)]
up_distance = pd.merge(up_distance,
topic_product,
on = ['product_id'],
how = 'left')
up_distance.columns = ['user_id', 'product_id'] + ["u_topic_%d"%x for x in range(10)] + ["p_topic_%d"%x for x in range(10)]
In [87]:
def cal_up_distance(subf):
    '''euclidean distance between a user's topic vector and a product's topic vector'''
    u_topic = subf[["u_topic_%d"%x for x in range(10)]]
    p_topic = subf[["p_topic_%d"%x for x in range(10)]]
    upd = euclidean(u_topic, p_topic)
    return upd
In [92]:
# this row-wise apply takes about 3 hours
up_distance['up_dis'] = up_distance.apply(cal_up_distance, axis = 1)
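Since the row-wise apply above takes hours, here is a vectorized alternative (a sketch computing the same euclidean distances with plain numpy over the assumed `u_topic_*` / `p_topic_*` columns):
In [ ]:
u_mat = up_distance[["u_topic_%d" % x for x in range(10)]].values
p_mat = up_distance[["p_topic_%d" % x for x in range(10)]].values
up_distance['up_dis'] = np.sqrt(((u_mat - p_mat) ** 2).sum(axis=1))  # row-wise euclidean distance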
In [94]:
up_distance = up_distance[['user_id', 'product_id', 'up_dis']]
with open(constants.FEAT_DATA_DIR + 'upd_feat.pkl', 'wb') as f:
    pickle.dump(up_distance, f, pickle.HIGHEST_PROTOCOL)
In [309]:
order_topic = pd.merge(order_products_prior[['order_id', 'product_id']],
topic_product,
on = ['product_id'],
how = 'inner')  # inner join drops products the vectorizer filtered out as stop words
In [312]:
order_topic = order_topic.groupby(['order_id'])[["topic_%d"%x for x in range(10)]].sum().reset_index()
In [314]:
unorm = order_topic[["topic_%d"%x for x in range(10)]].values
In [315]:
order_topic[["topic_%d"%x for x in range(10)]] = unorm / unorm.sum(axis = 1)[:,np.newaxis]
In [301]:
len(order_products_prior.product_id.unique())
Out[301]:
In [302]:
len(topic_product.product_id.unique())
Out[302]:
In [1]:
import constants, utils, transactions, feats
from importlib import reload
In [3]:
tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)
In [33]:
train_none = feats.make_train_or_test_none(tle, 'train')
In [34]:
test_none = feats.make_train_or_test_none(tle, 'test')
In [4]:
train = feats.make_train_or_test(tle, 'train')
In [ ]:
utils.check_inf_nan(train[up_cols])
In [ ]:
utils.check_inf_nan(train[ua_cols])
In [ ]:
utils.check_inf_nan(train[ud_cols])
In [7]:
utils.check_inf_nan(train[p_cols])
Out[7]:
In [ ]:
utils.check_inf_nan(train[a_cols])
In [ ]:
utils.check_inf_nan(train[d_cols])
In [ ]:
utils.check_inf_nan(train[ctx_cols])
In [ ]:
utils.check_inf_nan(train[topic_cols])
Purchase intervals and add-to-cart order as symbols
Add-to-cart order
Purchase interval
Implementation
In [4]:
users_orders = tle.get_users_orders('prior')
In [ ]:
product_feat = tle.craft_feat_item('products')
In [255]:
user_feat = tle.craft_feat_user()
In [256]:
users_orders = pd.merge(users_orders, product_feat[['product_id', 'p_reorder_probability']], on=['product_id'], how='left')
In [257]:
users_orders = pd.merge(users_orders, user_feat[['user_id', 'u_total_reorders']], on=['user_id'], how='left')
In [258]:
def encode_numeric(row, bins):
    '''
    convert a numeric value into a binned category
    bins = [b1, b2, b3, b4], sorted ascending, starting at or below the smallest possible value
    returns a one-element list (the largest bin edge <= row) so that the symbols
    of several features can later be concatenated with +
    '''
    index = ~(row < bins)       # True for every edge <= row
    return [bins[index][-1]]    # keep the largest such edge
In [321]:
add2cart_bins = np.array([1, 2, 3, 4, 7, 12], dtype=float) # 6
interval_bins = np.array([-1, 4, 8, 17, 34], dtype=float)# 5
p_reorder_bins = np.array([0.0, 0.20, 0.38, 0.53], dtype=float)# 4
u_reorder_bins = np.array([0, 10, 33, 101], dtype=float)# 4
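A quick check of the binning (each call returns a one-element list so several feature symbols can be concatenated with +; values below the first bin edge would raise an IndexError, which is why the interval bins start at -1, the fill value used below):
In [ ]:
encode_numeric(5, add2cart_bins)   # -> [4.0], the largest edge <= 5
encode_numeric(0, interval_bins)   # -> [-1.0]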
In [ ]:
%%time
users_orders = users_orders.sort_values(['user_id', 'product_id', 'order_number'], ascending = False)
users_orders['up_interval'] = users_orders.groupby(['user_id', 'product_id'])['days_up_to_last'].diff()
users_orders['up_interval'] = users_orders['up_interval'].fillna(-1)
users_orders['up_interval_sym'] = users_orders.up_interval.apply(lambda x: encode_numeric(x, interval_bins))
users_orders['up_add2cart_order_sym'] = users_orders.add_to_cart_order.apply(lambda x: encode_numeric(x, add2cart_bins))
In [265]:
users_orders['p_reorder_prob_sym'] = users_orders.p_reorder_probability.apply(lambda x: encode_numeric(x, p_reorder_bins))
users_orders['u_reorder_sym'] = users_orders.u_total_reorders.apply(lambda x:encode_numeric(x, u_reorder_bins))
In [322]:
feat_card = [add2cart_bins, interval_bins, p_reorder_bins, u_reorder_bins]
In [323]:
feat_cartesian = cartesian(feat_card)
In [327]:
users_orders['up_card'] = users_orders.up_add2cart_order_sym + users_orders.up_interval_sym + users_orders.p_reorder_prob_sym + users_orders.u_reorder_sym  # each *_sym is a one-element list, so + concatenates them into the 4-bin combination
In [337]:
def encode_cartesian(row, feat_cartesian):
    '''
    lookup table:
    turn a group of categorical variables (a list of bin edges) into a single
    symbol, its 1-based row index in the cartesian product of all bins
    '''
    sym = np.where(np.all(row == feat_cartesian, axis=1))[0][0] + 1
    return sym
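A small check of the lookup (with the bins above, feat_cartesian has 6 * 5 * 4 * 4 = 480 rows, so symbols run from 1 to 480):
In [ ]:
feat_cartesian.shape                                      # (480, 4)
encode_cartesian([1.0, -1.0, 0.0, 0.0], feat_cartesian)   # -> 1, the first bin combination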
In [340]:
%%time
users_orders['up_airr_sym'] = users_orders.up_card.apply(lambda x: encode_cartesian(x, feat_cartesian))
In [352]:
up_airr_sym = users_orders[['user_id', 'product_id', 'order_number', 'up_airr_sym']].copy()
In [354]:
up_airr_sym.sort_values(['user_id', 'product_id', 'order_number'], inplace=True)
In [356]:
up_airr_sym_list = up_airr_sym.groupby(['user_id', 'product_id'])['up_airr_sym'].apply(list).reset_index()
In [358]:
with open(constants.FEAT_DATA_DIR + 'up_airr_sym.pkl', 'wb') as f:
    pickle.dump(up_airr_sym_list, f, pickle.HIGHEST_PROTOCOL)
Time Series Forecasting problem
Approach 2: LSTM using only purchase-interval information
Preprocessing
In [3]:
users_orders = tle.get_users_orders(prior_or_train='prior')
In [4]:
a = users_orders[['user_id', 'order_number', 'product_id', 'days_up_to_last', 'p_purchase_interval']].sort_values(['user_id', 'order_number', 'p_purchase_interval'])
In [5]:
del users_orders
In [10]:
a.sort_values(['user_id', 'product_id', 'order_number'], ascending=False, inplace=True)
In [11]:
%%time
a['up_interval'] = a.head(1000).groupby(['user_id', 'product_id'])['days_up_to_last'].diff()  # timing check on the first 1,000 rows only
In [13]:
a.sort_values(['user_id', 'product_id'])
Out[13]:
In [4]:
print("number of (u,p,t) tuples: %d"%len(users_orders))
In [ ]:
del users_orders # free memory usage
In [12]:
users_orders_intervals = users_orders.dropna()  # drop NaN intervals, i.e. products bought only once
In [16]:
users_orders_intervals = users_orders_intervals[users_orders_intervals.p_purchase_interval > 0]  # drop records where the product was bought again on the same day
In [18]:
users_orders_intervals = users_orders_intervals.sort_values(['user_id', 'product_id', 'order_number'])
In [19]:
%%time
up_interval_list = users_orders_intervals.groupby(['user_id', 'product_id'])['p_purchase_interval'].apply(list).reset_index()
In [20]:
len(up_interval_list)
Out[20]:
In [22]:
del users_orders_intervals # free memory usage
In [24]:
up_interval_list['len'] = up_interval_list.p_purchase_interval.apply(lambda x: len(x))
In [25]:
up_interval_list = up_interval_list[up_interval_list.len >= 2] # for train/test split
In [ ]:
with open(constants.FEAT_DATA_DIR + 'up_interval_feat.pkl', 'wb') as f:
    pickle.dump(up_interval_list, f, pickle.HIGHEST_PROTOCOL)
In [ ]:
len(up_interval_list)
In [ ]:
up_interval_list.len.describe()
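The interval lists above could feed the LSTM named in the section header. A minimal sketch (an assumption: TensorFlow/Keras is available, and MAX_LEN is a hypothetical cap) that pads each history and predicts the last interval from the preceding ones:
In [ ]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Masking, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 20  # hypothetical cap on the history length

seqs = up_interval_list.p_purchase_interval.tolist()
X = pad_sequences([s[:-1] for s in seqs], maxlen=MAX_LEN, dtype='float32')[:, :, np.newaxis]
y = np.array([s[-1] for s in seqs], dtype='float32')  # the last interval is the prediction target

model = Sequential([
    Masking(mask_value=0.0, input_shape=(MAX_LEN, 1)),  # padded steps are zeros and get masked
    LSTM(32),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.fit(X, y, batch_size=256, epochs=2, validation_split=0.1)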