In [1]:
import constants
import feats
import transactions
import utils
import pickle
import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
In [2]:
from importlib import reload  # importlib replaces the deprecated imp module
In [3]:
reload(transactions)
Out[3]:
In [4]:
tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)
In [5]:
order_products_prior = tle.get_orders_items('prior')
order_products_train = tle.get_orders_items('train')
order_products_all = pd.concat([order_products_train, order_products_prior], axis=0)
order_products_all = order_products_all.sort_values(['order_id', 'add_to_cart_order'])
In [10]:
# Approach 2
orders = tle.get_orders()
users_orders = tle.get_users_orders('prior')
# convert each user's product_ids into one space-separated string
users_products_matrix = users_orders.groupby(['user_id'])['product_id'].apply(utils.series_to_str)
# build the vocabulary
tf = CountVectorizer(analyzer = 'word', lowercase = False, max_df=0.95, min_df=2,)
tf_matrix = tf.fit_transform(users_products_matrix.values)
tf_feature_names = tf.get_feature_names()
#with open(DATA_DIR + 'user_tf_matrix', 'wb') as f:
# pickle.dump(tf_matrix, f, pickle.HIGHEST_PROTOCOL)
products = tle.get_items('products')
aisles = tle.get_items('aisles')
departments = tle.get_items('departments')
product_a = pd.merge(products, aisles, on = ['aisle_id'], how = 'left')
product_ad = pd.merge(product_a, departments, on = ['department_id'], how = 'left')
del product_ad['aisle_id']
del product_ad['department_id']
product_ad['chain_product_name'] = product_ad['department'] + ' _ ' + \
                                   product_ad['aisle'] + ' _ ' + \
                                   product_ad['product_name']
tf_product_names = []  # list of product names
for pid in tf_feature_names:
    tf_product_names.append(product_ad[product_ad.product_id == int(pid)]['chain_product_name'].values[0])
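utils.series_to_str is project code not reproduced in this notebook; a minimal sketch of what it presumably does, assuming it simply joins the grouped IDs into one space-separated string so each user becomes a "document" of product_id tokens for CountVectorizer:

# Hypothetical sketch of utils.series_to_str (the real helper lives in utils.py)
def series_to_str(series):
    # join the grouped IDs into a single space-separated string of tokens
    return ' '.join(str(x) for x in series)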
In [ ]:
tf_info = pd.DataFrame({'pid':tf_feature_names, 'pname':tf_product_names})
In [10]:
#with open(DATA_DIR + 'user_tf_matrix_info.pkl', 'wb') as f:
# pickle.dump(tf_info, f, pickle.HIGHEST_PROTOCOL)
In [2]:
orders = tle.get_orders()
users_orders = tle.get_users_orders('prior')
In [15]:
def pad_tf(pad, users_orders):
    # concatenate each group's user_ids into one space-separated string
    pad_users_matrix = users_orders.groupby([pad])['user_id'].apply(utils.series_to_str)
    # build the vocabulary
    tf = CountVectorizer(analyzer='word', lowercase=False, max_df=0.95, min_df=2)
    tf_matrix = tf.fit_transform(pad_users_matrix.values)
    tf_feature_names = tf.get_feature_names()
    tf_info = pd.DataFrame({'term_id': np.arange(len(tf_feature_names)), 'user_id': tf_feature_names})
    with open(constants.FEAT_DATA_DIR + pad[:-3] + '_tf_matrix', 'wb') as f:
        pickle.dump(tf_matrix, f, pickle.HIGHEST_PROTOCOL)
    with open(constants.FEAT_DATA_DIR + pad[:-3] + '_tf_info', 'wb') as f:
        pickle.dump(tf_info, f, pickle.HIGHEST_PROTOCOL)
In [16]:
for pad in ['product_id', 'aisle_id', 'department_id']:
    pad_tf(pad, users_orders)
In [2]:
pad = 'aisle_id'
In [3]:
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_tf_matrix', 'rb') as f:
    a_tf_matrix = pickle.load(f)
In [5]:
a_tf_matrix.shape
Out[5]:
In [6]:
pad = 'product_id'
In [7]:
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_tf_matrix', 'rb') as f:
    p_tf_matrix = pickle.load(f)
In [8]:
p_tf_matrix.shape
Out[8]:
In [5]:
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora
In [6]:
import constants
In [7]:
# raw data
tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)
users_orders = tle.get_users_orders('prior')
In [15]:
%%time
def pad_tf(pad, users_orders):
    pad_user = users_orders.groupby([pad])['user_id'].apply(utils.series_to_str)  # convert into str
    pad_user = [doc.split() for doc in pad_user.values]  # split into token lists
    dictionary = corpora.Dictionary(pad_user)  # create dictionary
    pad_term_matrix = [dictionary.doc2bow(doc) for doc in pad_user]
    return dictionary, pad_term_matrix

for pad in ['product_id', 'aisle_id', 'department_id']:
    p_dict, p_term_matrix = pad_tf(pad, users_orders)
    with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_dict', 'wb') as f:
        pickle.dump(p_dict, f, pickle.HIGHEST_PROTOCOL)
    with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_tf', 'wb') as f:
        pickle.dump(p_term_matrix, f, pickle.HIGHEST_PROTOCOL)
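The LdaMulticore models loaded below (e.g. 'p_gensim_lda_%d') are trained outside this section; a minimal sketch of how they were presumably fitted from the corpus and dictionary saved above — the topic counts, workers setting and single pass are assumptions:

# Hypothetical training sketch (not run here): fit one model per topic count
# on the product-level corpus, then save it where the cells below load it from.
with open(constants.FEAT_DATA_DIR + 'product_gensim_dict', 'rb') as f:
    p_dict = pickle.load(f)
with open(constants.FEAT_DATA_DIR + 'product_gensim_tf', 'rb') as f:
    p_term_matrix = pickle.load(f)
for n in [10, 60, 110, 160, 210]:
    lda = LdaMulticore(corpus=p_term_matrix, id2word=p_dict, num_topics=n, workers=3)
    lda.save(constants.LDA_DIR + 'p_gensim_lda_%d' % n)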
In [7]:
pad = 'department_id'
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_tf', 'rb') as f:
    p_term_matrix = pickle.load(f)
In [5]:
%%time
pad = 'product_id'
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_tf', 'rb') as f:
    p_term_matrix = pickle.load(f)
for n in [10, 60, 110, 160, 210]:
    print('number of topics: %d'%n)
    lda = LdaMulticore.load(constants.LDA_DIR + 'p_gensim_lda_%d'%n)
    print(lda.log_perplexity(p_term_matrix, total_docs=1677))
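For reference, gensim's log_perplexity returns a per-word likelihood bound in log base 2, so the actual perplexity can be recovered as in this small sketch (total_docs=1677 is taken from the cell above):

bound = lda.log_perplexity(p_term_matrix, total_docs=1677)  # per-word log2 likelihood bound
perplexity = 2 ** (-bound)  # lower perplexity indicates a better fit on this corpus
print(bound, perplexity)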
In [5]:
num_topic = 10
lda = LdaMulticore.load(constants.LDA_DIR + 'p_gensim_lda_%d'%num_topic)
pad = 'product_id'
pad_user = users_orders.groupby([pad])['user_id'].apply(utils.series_to_str)
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_dict', 'rb') as f:
    p_dict = pickle.load(f)
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_tf', 'rb') as f:
    p_term_matrix = pickle.load(f)
p_topics = [[v for k,v in lda.get_document_topics(p, minimum_phi_value=0, minimum_probability=0)] for p in p_term_matrix]
p_topics = pd.DataFrame(p_topics, columns = ['p_topic_%d'%n for n in range(num_topic)])
p_topics['product_id'] = pad_user.index.values
user_id = [int(token) for word_id, token in p_dict.iteritems()]
u_topics = [{k:v for k,v in lda.get_term_topics(word_id, minimum_probability=0)} for word_id, _ in p_dict.iteritems()]
u_topics = pd.DataFrame(u_topics).fillna(0)
u_topics.columns = ['u_topic_%d'%i for i in range(num_topic)]
u_topics = u_topics / u_topics.sum() # column normalization, make sure each topic sums to 1
u_topics = (u_topics.transpose() / u_topics.transpose().sum()).transpose() # row normalization
u_topics['user_id'] = user_id
with open(constants.FEAT_DATA_DIR + 'pad_p_topic.pkl', 'wb') as f:
    pickle.dump(p_topics, f, pickle.HIGHEST_PROTOCOL)
with open(constants.FEAT_DATA_DIR + 'pad_p_u_topic.pkl', 'wb') as f:
    pickle.dump(u_topics, f, pickle.HIGHEST_PROTOCOL)
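A quick, optional sanity check on the normalization above (hypothetical cell): every product row of p_topics and every user row of u_topics should sum to roughly 1.

p_cols = ['p_topic_%d' % n for n in range(num_topic)]
u_cols = ['u_topic_%d' % i for i in range(num_topic)]
print(p_topics[p_cols].sum(axis=1).describe())  # document-topic rows sum to ~1
print(u_topics[u_cols].sum(axis=1).describe())  # row-normalized user vectors sum to ~1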
In [37]:
num_topic = 10
lda = LdaMulticore.load(constants.LDA_DIR + 'a_gensim_lda_%d'%num_topic)
pad = 'aisle_id'
pad_user = users_orders.groupby([pad])['user_id'].apply(utils.series_to_str)
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_dict', 'rb') as f:
    p_dict = pickle.load(f)
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_tf', 'rb') as f:
    p_term_matrix = pickle.load(f)
p_topics = [[v for k,v in lda.get_document_topics(p, minimum_phi_value=0, minimum_probability=0)] for p in p_term_matrix]
p_topics = pd.DataFrame(p_topics, columns = ['p_topic_%d'%n for n in range(num_topic)])
p_topics['aisle_id'] = pad_user.index.values
user_id = [int(token) for word_id, token in p_dict.iteritems()]
u_topics = [{k:v for k,v in lda.get_term_topics(word_id, minimum_probability=0)} for word_id, _ in p_dict.iteritems()]
u_topics = pd.DataFrame(u_topics).fillna(0)
u_topics.columns = ['u_topic_%d'%i for i in range(num_topic)]
u_topics = u_topics / u_topics.sum() # column normalization, make sure each topic sums to 1
u_topics = (u_topics.transpose() / u_topics.transpose().sum()).transpose() # row normalization
u_topics['user_id'] = user_id
with open(constants.FEAT_DATA_DIR + 'pad_a_topic.pkl', 'wb') as f:
    pickle.dump(p_topics, f, pickle.HIGHEST_PROTOCOL)
with open(constants.FEAT_DATA_DIR + 'pad_a_u_topic.pkl', 'wb') as f:
    pickle.dump(u_topics, f, pickle.HIGHEST_PROTOCOL)
In [8]:
num_topic = 4
lda = LdaMulticore.load(constants.LDA_DIR + 'd_gensim_lda_%d'%num_topic)
pad = 'department_id'
pad_user = users_orders.groupby([pad])['user_id'].apply(utils.series_to_str)
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_dict', 'rb') as f:
    p_dict = pickle.load(f)
with open(constants.FEAT_DATA_DIR + pad[:-3] + '_gensim_tf', 'rb') as f:
    p_term_matrix = pickle.load(f)
p_topics = [[v for k,v in lda.get_document_topics(p, minimum_phi_value=0, minimum_probability=0)] for p in p_term_matrix]
p_topics = pd.DataFrame(p_topics, columns = ['p_topic_%d'%n for n in range(num_topic)])
p_topics[pad] = pad_user.index.values
user_id = [int(token) for word_id, token in p_dict.iteritems()]
u_topics = [{k:v for k,v in lda.get_term_topics(word_id, minimum_probability=0)} for word_id, _ in p_dict.iteritems()]
u_topics = pd.DataFrame(u_topics).fillna(0)
# u_topics[4] = 0 # column 4 missing
u_topics.columns = ['u_topic_%d'%i for i in range(num_topic)]
u_topics = u_topics / u_topics.sum() # column normalization, make sure each topic sums to 1
u_topics = (u_topics.transpose() / u_topics.transpose().sum()).transpose() # row normalization
u_topics['user_id'] = user_id
with open(constants.FEAT_DATA_DIR + 'pad_d_topic.pkl', 'wb') as f:
    pickle.dump(p_topics, f, pickle.HIGHEST_PROTOCOL)
with open(constants.FEAT_DATA_DIR + 'pad_d_u_topic.pkl', 'wb') as f:
    pickle.dump(u_topics, f, pickle.HIGHEST_PROTOCOL)
In [53]:
ud = tle.craft_up_distance(filepath = ['pad_d_topic.pkl', 'pad_d_u_topic.pkl'], num_topic = 8, pad = 'department_id')
ud = tle.craft_up_distance(filepath = ['pad_a_topic.pkl', 'pad_a_u_topic.pkl'], num_topic = 10, pad = 'aisle_id')
ud = tle.craft_up_distance(filepath = ['pad_p_topic.pkl', 'pad_p_u_topic.pkl'], num_topic = 10, pad = 'product_id')
In [15]:
?? tle.craft_up_distance
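craft_up_distance is implemented in transactions.py and is only inspected here; as a reference, a minimal sketch of the kind of symmetric-KL user-product topic distance it could compute over the topic vectors built above (the function name and the eps smoothing are assumptions, not the project's actual code):

def sym_kl_distance(user_vec, product_vec, eps=1e-12):
    # symmetric KL divergence between two topic distributions (hypothetical helper)
    p = np.asarray(user_vec, dtype=float) + eps
    q = np.asarray(product_vec, dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    return 0.5 * (entropy(p, q) + entropy(q, p))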
In [ ]:
# Approach 3
order_product_names = pd.merge(order_products_all[['order_id', 'product_id']],
                               products[['product_id', 'product_name']],
                               on=['product_id'],
                               how='left')
In [28]:
# ~8 minutes. Note: aggregate('sum') concatenates the raw name strings with no separator,
# so adjacent product names can fuse into a single token for the CountVectorizer below.
order_pnames_matrix = order_product_names.groupby(['order_id'])['product_name'].aggregate('sum')
In [29]:
order_pnames_matrix.head(5)
Out[29]:
In [93]:
%%time
tf = CountVectorizer(analyzer = 'word', min_df=10, token_pattern='(?u)\\b[a-zA-Z]\\w+\\b')
tf_matrix = tf.fit_transform(order_pnames_matrix.values)
tf_feature_names = tf.get_feature_names()
In [94]:
'crisp' in tf_feature_names
Out[94]:
In [95]:
tf_matrix.shape
Out[95]:
number of topics:10 log likelihood:-482209266.283346
number of topics:20 log likelihood:-739555562.008604
number of topics:30 log likelihood:-1005367108.142659
number of topics:40 log likelihood:-1293773643.990032
number of topics:50 log likelihood:-1578475501.104584
number of topics:60 log likelihood:-1891792320.600158
number of topics:70 log likelihood:-2185429399.135661
number of topics:80 log likelihood:-2499275522.415354
number of topics:90 log likelihood:-2814907367.346162
number of topics:100 log likelihood:-3124264684.650005
In [12]:
with open(DATA_DIR + 'user_tf_matrix', 'rb') as f:
    user_tf_matrix = pickle.load(f)
In [4]:
%%time
n_topics = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
n_top_words = 10
scores = []
for n in n_topics:
    print("number of topics:%d"%n)
    with open(DATA_DIR + 'lda_%d.model'%n, 'rb') as f:
        lda = pickle.load(f)
    scores.append(lda.score(user_tf_matrix))
    print("log likelihood:%f"%scores[-1])
    #print_top_words(lda, tf_product_names, n_top_words)
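The pickled 'lda_%d.model' files scored here were fitted in a separate run; a minimal sketch of how they were presumably produced with the LatentDirichletAllocation import from the first cell (the learning_method and random_state are assumptions; older sklearn releases use n_topics instead of n_components):

for n in n_topics:
    lda = LatentDirichletAllocation(n_components=n, learning_method='online', random_state=0)
    lda.fit(user_tf_matrix)
    with open(DATA_DIR + 'lda_%d.model' % n, 'wb') as f:
        pickle.dump(lda, f, pickle.HIGHEST_PROTOCOL)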
In [13]:
max(scores)
Out[13]:
In [5]:
with open(constants.LDA_DIR + 'lda_22.model', 'rb') as f:
    lda = pickle.load(f)
In [15]:
with open(constants.FEAT_DATA_DIR + 'user_tf_matrix_info.pkl', 'rb') as f:
    tf_info = pickle.load(f)
In [17]:
with open(constants.FEAT_DATA_DIR + 'user_tf_matrix', 'rb') as f:
    user_tf_matrix = pickle.load(f)
In [6]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print("\n".join([feature_names[i]
                         for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()
In [32]:
def csv_top_words(model, feature_names, n_top_words):
    topic_content = {}
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        content = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        # print("\n".join(content))
        topic_content['topic_%s'%topic_idx] = content
    res = pd.DataFrame(topic_content)
    res.to_csv('lda_topic_content.csv')
    print()
    return res
In [38]:
lda.components_[0][:10:1]
Out[38]:
In [17]:
print_top_words(lda, tf_info.pname, 20)
In [3]:
NUM_TOPIC = 22
tf_feature_names = tf_info.pid.values
with open(constants.LDA_DIR + 'lda_%d.model'%NUM_TOPIC, 'rb') as f:  # replace 10 with the chosen topic count
    lda = pickle.load(f)
# Each topic is a distribution over products: divide by each topic's total weight so
# every product's contribution to that topic is a proper proportion (rows sum to 1).
# np.newaxis turns the (22,) row sums into shape (22, 1) for broadcasting
norm_comp = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
# Each product also corresponds to a distribution over topics: normalize each column so its probabilities sum to 1
norm_comp = norm_comp / norm_comp.sum(axis=0)
topic_product = pd.DataFrame(norm_comp.transpose(), columns = ["topic_%d"%x for x in range(NUM_TOPIC)])
topic_product['product_id'] = [int(x) for x in tf_feature_names]
user_topic = lda.transform(user_tf_matrix)
user_topic = pd.DataFrame(user_topic, columns = ["topic_%d"%x for x in range(NUM_TOPIC)])
user_id = users_products_matrix.index.values
user_topic['user_id'] = user_id
with open(DATA_DIR + 'user_topic_%d.pkl'%NUM_TOPIC, 'wb') as f:
    pickle.dump(user_topic, f, pickle.HIGHEST_PROTOCOL)
with open(DATA_DIR + 'topic_product_%d.pkl'%NUM_TOPIC, 'wb') as f:
    pickle.dump(topic_product, f, pickle.HIGHEST_PROTOCOL)
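Optional sanity check (hypothetical cell): after the two normalizations each product's weights across topics in topic_product should sum to ~1, and each row of the transformed user_topic is already a distribution over topics.

topic_cols = ["topic_%d" % x for x in range(NUM_TOPIC)]
print(topic_product[topic_cols].sum(axis=1).describe())
print(user_topic[topic_cols].sum(axis=1).describe())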
In [23]:
with open(DATA_DIR + 'user_topic_%d.pkl'%NUM_TOPIC, 'rb') as f:
    user_topic = pickle.load(f)
with open(DATA_DIR + 'topic_product_%d.pkl'%NUM_TOPIC, 'rb') as f:
    topic_product = pickle.load(f)
In [24]:
user_topic.head(5)
Out[24]:
In [34]:
topic_product.describe()
Out[34]:
avg_reorder_len, component normalize, score with Euclidean distance: f1 score: 0.11
avg_reorder_len, component normalize, score with symmetric KL distance: f1 score: 0.108
In [26]:
with open(DATA_DIR + 'user_product.pkl', 'rb') as f:
    user_product = pickle.load(f)
In [27]:
with open(DATA_DIR + 'user_reorder_est.pkl', 'rb') as f:
    avg_reorder_est = pickle.load(f)
In [28]:
train_orders = orders[orders.eval_set == 'train']
In [29]:
#%%time
u_nnp = []
cnt = 0
for u in train_orders.user_id:
    cnt += 1
    if cnt % 10000 == 0:
        print("Nearest Product Search for %dth user"%cnt)
    # extract user u's topic
    u_topic = user_topic[user_topic.user_id == u][["topic_%d"%x for x in range(10)]]
    # extract user u's product list
    u_products = user_product[user_product.user_id == u]['product_id']
    # extract avg_reorder_num
    u_reorder = avg_reorder_est[avg_reorder_est.user_id == u]['avg_reorder_num']
    # extract products' topic
    p_topics = topic_product[topic_product.product_id.isin(set(u_products.values[0]))]
    p_topics_pid = p_topics['product_id'].reset_index()['product_id']
    p_topics_vec = p_topics[["topic_%d"%x for x in range(10)]]
    # nbr search, expand search scope
    n_neighbors = 1 * int(np.ceil(u_reorder.values[0]))
    if n_neighbors > len(p_topics_vec):
        n_neighbors = len(p_topics_vec)
    if n_neighbors > 0:
        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                metric='l1',
                                algorithm='brute').fit(p_topics_vec.values)
        #print(u)
        distances, indices = nbrs.kneighbors(u_topic.values.reshape(1, -1))
        #print(cnt)
        u_nnp.append(list(p_topics_pid[indices[0]].values))
    else:
        u_nnp.append('None')
In [30]:
train_pred = pd.DataFrame({'user_id': train_orders.user_id,
                           'reorder_pids': u_nnp,
                           'order_id': train_orders.order_id})
train_gold = order_products_train[order_products_train.reordered == 1].groupby(['order_id'])['product_id'].apply(list)
train_gold = train_gold.reset_index()
train_eval = pd.merge(train_gold, train_pred, on = ['order_id'], how = 'outer').fillna('None')
train_eval.columns = ['order_id', 'gold_reorder', 'pred_reorder', 'user_id']
In [31]:
# 21mins
train_eval['f1score'] = train_eval.apply(wrap_cal_f1, axis = 1)
train_eval['precision'] = train_eval.apply(wrap_cal_precision, axis = 1)
train_eval['recall'] = train_eval.apply(wrap_cal_recall, axis = 1)
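wrap_cal_f1 / wrap_cal_precision / wrap_cal_recall are project helpers (likely in utils.py) that are not defined in this notebook; a minimal sketch of the F1 variant, assuming each row carries gold_reorder and pred_reorder lists with 'None' marking an empty side:

def wrap_cal_f1(row):
    # hypothetical per-row F1 between the gold and predicted reorder lists
    gold = set(row['gold_reorder']) if row['gold_reorder'] != 'None' else set()
    pred = set(row['pred_reorder']) if row['pred_reorder'] != 'None' else set()
    if not gold and not pred:
        return 1.0
    tp = len(gold & pred)
    if tp == 0:
        return 0.0
    precision, recall = tp / len(pred), tp / len(gold)
    return 2 * precision * recall / (precision + recall)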
In [32]:
train_eval.f1score.mean()
Out[32]:
In [33]:
train_eval.describe()
Out[33]:
In [389]:
train_eval[train_eval.user_id == 201]
Out[389]:
In [378]:
prior_all = pd.merge(order_products_prior, orders, on = ['order_id'], how = 'left')
In [379]:
prior_all = pd.merge(prior_all,
                     product_ad[['product_id', 'product_name', 'aisle', 'department']],
                     on=['product_id'],
                     how='left')
In [132]:
%%time
order_topic = lda_10.transform(order_tf_matrix)
In [130]:
with open(DATA_DIR + 'tf_matrix', 'rb') as f:
    order_tf_matrix = pickle.load(f)
In [144]:
order_products_matrix = order_products_matrix.reset_index()
In [152]:
order_topic = pd.DataFrame(order_topic, columns = ["topic_%d"%x for x in range(10)])
In [153]:
Out[153]:
In [ ]: