In [158]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [159]:
# Lookup tables mapping aisle_id / department_id to their names.
aisles = pd.read_csv('data/aisles.csv', index_col='aisle_id')['aisle'].to_dict()
depart = pd.read_csv('data/departments.csv', index_col='department_id')['department'].to_dict()

# Product table: swap the numeric aisle/department ids for the names looked up
# above. A missing id raises KeyError rather than silently producing NaN.
products_df = pd.read_csv('data/products.csv', index_col='product_id')
products_df['aisle'] = products_df['aisle_id'].map(lambda aisle_id: aisles[aisle_id])
products_df['department'] = products_df['department_id'].map(lambda dept_id: depart[dept_id])
products_df = products_df.drop(['aisle_id', 'department_id'], axis=1)
In [160]:
# All three tables are keyed by order_id, so they are loaded the same way.
prior_df, train_df, orders_df = (
    pd.read_csv(csv_path, index_col='order_id')
    for csv_path in (
        'data/order_products__prior.csv',
        'data/order_products__train.csv',
        'data/orders.csv',
    )
)
In [161]:
# Preview the first rows of the product table (aisle/department ids already replaced by names).
products_df.head()
Out[161]:
In [162]:
# Preview the first rows of the table loaded from order_products__prior.csv.
prior_df.head()
Out[162]:
In [163]:
# Preview the first rows of the orders table.
orders_df.head()
Out[163]:
In [235]:
def get_order_to_products(df):
    """Collapse order line items into one tuple of reordered product ids per order.

    Parameters
    ----------
    df : pandas.DataFrame
        Line items indexed by order id, with numeric ``product_id`` and
        ``reordered`` columns. (``reordered`` is presumably a 0/1 flag --
        verify against the data.)

    Returns
    -------
    pandas.DataFrame
        One row per distinct index value of ``df`` with a single column
        ``'products'`` holding a tuple of the product ids whose
        ``product_id * reordered`` is positive. An order with no such item
        gets an empty tuple.
    """
    # Multiplying by the flag zeroes out non-reordered items so they can be
    # filtered with a simple "> 0" test below.
    reordered_ids = df.product_id * df.reordered

    # NOTE: the dict form ``.aggregate({'products': func})`` on a Series
    # groupby was removed in pandas 1.0 (SpecificationError: nested renamer
    # is not supported). Aggregate with a plain callable and name the column
    # afterwards instead; the resulting frame is the same.
    baskets = reordered_ids.groupby(level=0).agg(
        lambda ids: tuple(pid for pid in ids if pid > 0)
    )
    return baskets.to_frame('products')
def get_user_to_last_order(orders_df):
    """Map each user to the id of their last 'prior' order.

    Parameters
    ----------
    orders_df : pandas.DataFrame
        Orders indexed by ``order_id`` with ``user_id`` and ``eval_set``
        columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user_id`` with one column ``'order_id'``.
    """
    # "Last" means last in row order within each user's group.
    # NOTE(review): assumes orders_df rows are already in chronological
    # order per user -- confirm against the input file.
    prior_orders = orders_df.loc[orders_df.eval_set == 'prior'].reset_index()
    last_order_ids = prior_orders.groupby('user_id')['order_id'].last()
    return last_order_ids.to_frame()
def calc_f1_score(exp, act):
    """Return the F1 score between two collections of items.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored.

    Parameters
    ----------
    exp : iterable
        Expected (ground-truth) items.
    act : iterable
        Actual (predicted) items.

    Returns
    -------
    float
        Harmonic mean of precision (overlap / ``len(act)``) and recall
        (overlap / ``len(exp)``). An empty side counts as 1.0 for its own
        ratio, so two empty collections score 1.0; the result is 0.0 when
        precision and recall are both zero.
    """
    expected_items, predicted_items = set(exp), set(act)
    hits = len(expected_items & predicted_items)

    if predicted_items:
        precision = 1.0 * hits / len(predicted_items)
    else:
        precision = 1.0

    if expected_items:
        recall = 1.0 * hits / len(expected_items)
    else:
        recall = 1.0

    # Guard against 0/0 when there is no overlap at all.
    if (precision + recall) > 0:
        return 2.0 * precision * recall / (precision + recall)
    return 0.0
def get_score(expected, actual):
    """Mean per-row F1 between expected and actual product baskets.

    Parameters
    ----------
    expected, actual : pandas.DataFrame
        Frames that each carry a ``'products'`` column and share an index.

    Returns
    -------
    The mean of the per-row F1 scores over the rows whose index value occurs
    in both frames (inner join); rows present on only one side are not scored.
    """
    # The overlapping 'products' column is disambiguated by the suffixes,
    # yielding 'products_exp' and 'products_act'.
    paired = pd.merge(
        left=expected, right=actual,
        left_index=True, right_index=True,
        suffixes=['_exp', '_act'],
    )

    def _row_f1(row):
        return calc_f1_score(row.get('products_exp'), row.get('products_act'))

    per_row_scores = paired.apply(_row_f1, axis=1)
    return per_row_scores.mean()
In [226]:
# Expensive computation: groups every row of prior_df into one basket per index value.
# Kept in its own cell so the result can be reused by the cells below without recomputing.
order_to_products = get_order_to_products(prior_df)
In [236]:
def solution_1(order_to_products, prior_df, orders_df, label):
    """Make the same basket prediction as the last purchase of a user.

    Parameters
    ----------
    order_to_products : pandas.DataFrame
        Indexed by order id with a ``'products'`` column.
    prior_df : pandas.DataFrame
        Not used by this function; kept in the signature for callers.
    orders_df : pandas.DataFrame
        Orders indexed by ``order_id`` with ``user_id`` and ``eval_set``.
    label : str
        Value of ``eval_set`` selecting the orders to predict for.

    Returns
    -------
    pandas.DataFrame
        One ``'products'`` column, indexed like the selected rows of
        ``orders_df``. Both joins are inner joins, so orders without a
        matching last prior basket are dropped.
    """
    # user_id -> products of that user's last prior order
    last_order_per_user = get_user_to_last_order(orders_df)
    last_basket_per_user = pd.merge(
        left=last_order_per_user, right=order_to_products,
        left_on='order_id', right_index=True, how='inner',
    )

    # Orders to predict: attach each one's user's last basket.
    target_orders = orders_df.loc[orders_df.eval_set == label, ['user_id']]
    predicted = pd.merge(
        left=target_orders, right=last_basket_per_user,
        left_on='user_id', right_index=True, how='inner',
    )
    return predicted[['products']]
# Validate on the train split, where the true baskets are known.
train_expected = get_order_to_products(train_df)
train_actual = solution_1(order_to_products, prior_df, orders_df, 'train')
print('Solution 1 train validation score: {}'.format(get_score(train_expected, train_actual)))

# Predict for the test split and write the result to CSV: product ids are
# space-separated, and an empty basket is written as the literal 'None'.
test_actual = solution_1(order_to_products, prior_df, orders_df, 'test')
test_actual = test_actual['products'].map(
    lambda basket: ' '.join(str(pid) for pid in basket) if basket else 'None'
).to_frame()
test_actual.to_csv("test_output.csv")
In [ ]: