In [158]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [159]:
# Lookup tables mapping numeric ids to their human-readable names.
aisles = pd.read_csv('data/aisles.csv', index_col='aisle_id')['aisle'].to_dict()
depart = pd.read_csv('data/departments.csv', index_col='department_id')['department'].to_dict()

# Product catalogue: swap the numeric aisle/department ids for their names.
# Plain dict indexing is used on purpose so an unknown id raises KeyError
# instead of silently producing a missing value.
products_df = pd.read_csv('data/products.csv', index_col='product_id')
products_df['aisle'] = products_df.aisle_id.map(lambda aisle_id: aisles[aisle_id])
products_df['department'] = products_df.department_id.map(lambda dept_id: depart[dept_id])
products_df = products_df.drop(['aisle_id', 'department_id'], axis=1)

In [160]:
# Load the three order-level tables; every one of them is indexed by order id.
prior_df, train_df, orders_df = (
    pd.read_csv(csv_path, index_col='order_id')
    for csv_path in (
        'data/order_products__prior.csv',
        'data/order_products__train.csv',
        'data/orders.csv',
    )
)

In [161]:
# Preview the product table after the aisle/department names were mapped in.
products_df.head()


Out[161]:
product_name aisle department
product_id
1 Chocolate Sandwich Cookies cookies cakes snacks
2 All-Seasons Salt spices seasonings pantry
3 Robust Golden Unsweetened Oolong Tea tea beverages
4 Smart Ones Classic Favorites Mini Rigatoni Wit... frozen meals frozen
5 Green Chile Anytime Sauce marinades meat preparation pantry

In [162]:
# Preview the prior order lines (indexed by order_id, one row per product).
prior_df.head()


Out[162]:
product_id add_to_cart_order reordered
order_id
2 33120 1 1
2 28985 2 1
2 9327 3 0
2 45918 4 1
2 30035 5 0

In [163]:
# Preview the orders table (indexed by order_id).
orders_df.head()


Out[163]:
user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
order_id
2539329 1 prior 1 2 8 NaN
2398795 1 prior 2 3 7 15.0
473747 1 prior 3 3 12 21.0
2254736 1 prior 4 4 7 29.0
431534 1 prior 5 4 15 28.0

In [235]:
def get_order_to_products(df):
    """Collect the reordered products of every order into a tuple.

    Parameters
    ----------
    df : pandas.DataFrame
        Order lines indexed by order id, with a ``product_id`` column and a
        0/1 ``reordered`` flag column.

    Returns
    -------
    pandas.DataFrame
        One row per order id present in ``df``, with a single ``products``
        column holding a tuple of the product ids flagged as reordered.
        Orders without any reordered product are kept, with an empty tuple.
    """
    # Multiplying by the 0/1 flag zeroes out first-time purchases; the `> 0`
    # filter below then drops them (this assumes product ids are positive).
    reordered_ids = df.product_id * df.reordered
    # Aggregate with a plain callable and name the column afterwards: the
    # dict-renaming form of Series aggregation ({'products': func}) was
    # removed in pandas 1.0 and raises SpecificationError there.
    baskets = reordered_ids.groupby(level=0).agg(
        lambda ids: tuple(pid for pid in ids if pid > 0)
    )
    return baskets.to_frame('products')

def get_user_to_last_order(orders_df):
    """Return the most recent 'prior' order of every user.

    Parameters
    ----------
    orders_df : pandas.DataFrame
        Orders indexed by ``order_id`` with at least the columns ``user_id``,
        ``eval_set`` and ``order_number``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user_id`` with a single ``order_id`` column holding the
        id of the user's 'prior' order with the highest ``order_number``.
    """
    prior_orders = orders_df[orders_df.eval_set == 'prior'].reset_index()
    # Do not rely on the file's row order being chronological: sort explicitly
    # by order_number. A stable sort keeps the original row order for ties, so
    # already-sorted input gives exactly the same result as before.
    prior_orders = prior_orders.sort_values('order_number', kind='mergesort')
    return prior_orders.groupby('user_id')['order_id'].last().to_frame()

def calc_f1_score(exp, act):
    """F1 score between an expected and an actual (predicted) collection.

    Both arguments are treated as sets, so duplicates are ignored.
    Precision is measured against ``act`` and recall against ``exp``; an
    empty side counts as 1.0 for its ratio, so two empty collections score
    1.0. The result is 0.0 when precision and recall are both zero.
    """
    expected_items, predicted_items = set(exp), set(act)
    n_hits = len(expected_items & predicted_items)

    if predicted_items:
        precision = 1.0 * n_hits / len(predicted_items)
    else:
        precision = 1.0

    if expected_items:
        recall = 1.0 * n_hits / len(expected_items)
    else:
        recall = 1.0

    if (precision + recall) > 0:
        return 2.0 * precision * recall / (precision + recall)
    return 0.0

def get_score(expected, actual):
    """Mean per-row F1 between expected and actual product baskets.

    The two frames are inner-joined on their index, so only index values
    present in both contribute to the mean. Each frame is expected to carry a
    ``products`` column; after the join they become ``products_exp`` and
    ``products_act``.
    """
    paired = pd.merge(
        left=expected,
        right=actual,
        left_index=True,
        right_index=True,
        suffixes=['_exp', '_act'],
    )

    def _row_f1(row):
        # Score one joined row: expected basket vs. predicted basket.
        return calc_f1_score(row.get('products_exp'), row.get('products_act'))

    per_row_f1 = paired.apply(_row_f1, axis=1)
    return per_row_f1.mean()

In [226]:
# Expensive computation: builds, for every prior order, the tuple of its
# reordered product ids. The result is reused by the cells below.
order_to_products = get_order_to_products(prior_df)

In [236]:
def solution_1(order_to_products, prior_df, orders_df, label):
    """Predict, for each order in the `label` set, the user's last basket.

    Parameters
    ----------
    order_to_products : DataFrame indexed by order id with a ``products`` column.
    prior_df : not used by this baseline; kept so the signature stays stable.
    orders_df : orders table indexed by order id (needs ``user_id``, ``eval_set``).
    label : value of ``eval_set`` selecting the orders to predict for.

    Returns
    -------
    DataFrame indexed like the selected orders with one ``products`` column.
    """
    # user_id -> products of that user's last prior order.
    last_basket_by_user = get_user_to_last_order(orders_df).merge(
        order_to_products,
        left_on='order_id', right_index=True, how='inner',
    )

    # Orders to predict for, keeping only the owning user's id.
    target_orders = orders_df.loc[orders_df.eval_set == label, ['user_id']]

    joined = target_orders.merge(
        last_basket_by_user,
        left_on='user_id', right_index=True, how='inner',
    )
    return joined[['products']]

# Validation: score the "repeat the last basket" baseline on the train orders.
train_actual = solution_1(order_to_products, prior_df, orders_df, 'train')
train_expected = get_order_to_products(train_df)
train_score = get_score(train_expected, train_actual)
print('Solution 1 train validation score: {}'.format(train_score))

# Submission: predict for the test orders and serialise each basket as
# space-separated product ids, or the literal 'None' for an empty basket.
test_actual = solution_1(order_to_products, prior_df, orders_df, 'test')
test_actual = test_actual.products.apply(
    lambda basket: ' '.join(map(str, basket)) or 'None'
).to_frame()
test_actual.to_csv("test_output.csv")


Solution 1 train validation score: 0.32558651711

In [ ]: