In [3]:
import pandas as pd
import re
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
import calendar
In [4]:
# First, let's import requisite files
orders = pd.read_csv('../Instacart_Input/orders.csv')
prior_set = pd.read_csv('../Instacart_Input/order_products__prior.csv')
train_set = pd.read_csv('../Instacart_Input/order_products__train.csv')
aisles = pd.read_csv('../Instacart_Input/aisles.csv')
departments = pd.read_csv('../Instacart_Input/departments.csv')
products = pd.read_csv('../Instacart_Input/products.csv')
In [5]:
test = orders[orders['eval_set'] == 'test']
test.head()
Out[5]:
In [6]:
# order history for all user_ids in test set
test_user_orders = orders[orders['user_id'].isin(test['user_id'].values)]
test_user_orders = test_user_orders.merge(prior_set, on='order_id')
test_user_orders.sort_values(by='user_id').head()
Out[6]:
In [7]:
# Let's add in department and aisle data, now that we have products
products_df = pd.merge(left=pd.merge(products, right=departments, on='department_id'),
right=aisles, on='aisle_id')
products_df = products_df.drop(['department_id', 'aisle_id'], axis=1)
test_user_orders = test_user_orders.merge(products_df, on='product_id')
test_user_orders.head()
Out[7]:
In [8]:
# For each customer, create an array of order_ids,
# and for each order_id, an array of products
user_orderlist = pd.DataFrame(test_user_orders.groupby('user_id')
['order_id'].apply(list))
order_productlist = pd.DataFrame(test_user_orders.groupby('order_id')
['product_id'].apply(list))
user_orderlist.head()
Out[8]:
In [9]:
# prettify dataframes
user_orderlist.reset_index(level='user_id', inplace=True)
user_orderlist.columns = ['user_id', 'orders_array']
order_productlist.reset_index(level='order_id', inplace=True)
order_productlist.columns = ['order_id', 'products_array']
order_productlist.head()
Out[9]:
In [10]:
# Let's add department and aisle lists for each order
order_categorylist = pd.DataFrame(test_user_orders.groupby('order_id')
['department'].apply(list).apply(np.unique))
order_sub_categorylist = pd.DataFrame(test_user_orders.groupby('order_id')
['aisle'].apply(list).apply(np.unique))
In [11]:
# Prettify DataFrames
order_categorylist.reset_index(level='order_id', inplace=True)
order_categorylist.columns = ['order_id', 'departments_array']
order_sub_categorylist.reset_index(level='order_id', inplace=True)
order_sub_categorylist.columns = ['order_id', 'aisles_array']
order_categorylist.head()
Out[11]:
In [12]:
# Let's calculate reorder rates for each order.
# First, we need to make the 'reordered' column more accessible
order_reorderlist = pd.DataFrame(test_user_orders.groupby('order_id')
['reordered'].apply(list))
# Prettify DataFrames
order_reorderlist.reset_index(level='order_id', inplace=True)
order_reorderlist.columns = ['order_id', 'reordered_array']
order_reorderlist.head()
Out[12]:
In [13]:
# Now, we can add reorder rates and product counts for each order
order_reorderlist['reorder_rate'] = 0.0
order_reorderlist['product_count'] = 0
for i in range(len(order_reorderlist)):
reorderlist = order_reorderlist['reordered_array'][i]
order_reorderlist.set_value(i, 'product_count', len(reorderlist))
if 1 in reorderlist:
order_reorderlist.set_value(i, 'reorder_rate', round(np.sum(reorderlist) / float(len(reorderlist))*100, 2))
order_reorderlist.tail()
Out[13]:
In [14]:
# Let's create a master DataFrame that centralizes
# all order data and newly minted meta-data
testgroup_priororders = orders[orders['user_id'].isin(test['user_id'].values)]
testgroup_priororders = testgroup_priororders[testgroup_priororders['eval_set'] == 'prior']
master_orders_df = pd.merge(testgroup_priororders,
pd.merge(order_reorderlist,
order_productlist,
on='order_id'),
on='order_id')
master_orders_df = pd.merge(master_orders_df, pd.merge(order_sub_categorylist,
order_categorylist,
on='order_id'),
on='order_id')
master_orders_df.head()
Out[14]:
In [15]:
# Complete list of previously purchased products for each user
# (includes duplicates!)
user_products = pd.DataFrame(test_user_orders.groupby('user_id')['product_id'].size())
# Prettify DataFrame
user_products.reset_index(level='user_id', inplace=True)
user_products.columns = ['user_id', 'total_products']
# Add total number of orders & averages
temp = orders[orders['user_id'].isin(test['user_id'].values)]
temp = temp[temp['eval_set'] == 'prior']
user_orders_meta = pd.DataFrame(temp.loc[:,('user_id', 'order_number')].groupby(['user_id']).size()).reset_index()
user_orders_meta = user_orders_meta.merge(user_products, on='user_id')
# Prettify DataFrame
user_orders_meta.columns = ['user_id', 'total_orders', 'total_products']
user_orders_meta['avg_products_per_order'] = (user_orders_meta['total_products'] / user_orders_meta['total_orders'])
user_orders_meta.head(10)
Out[15]:
In [16]:
# Lists of products and order frequency for each user
customer_product_counts = pd.DataFrame(test_user_orders.groupby('user_id')
['product_id'].value_counts())
# Prettify DataFrame
customer_product_counts.columns = ['order_instances']
customer_product_counts.head(35)
Out[16]:
In [18]:
# For each user, groups of order_numbers associated with each product_id
customer_orders_assortment = pd.DataFrame(test_user_orders.groupby(['user_id', 'product_id'])
['order_number'].value_counts())
# Prettify DataFrame
customer_orders_assortment = customer_orders_assortment.drop(['order_number'], axis=1)
customer_orders_assortment.head(15)
Out[18]:
In [19]:
# The above DataFrame is visually appealing,
# but I'll update it for easier data manipulation
customer_orders_assortment = customer_orders_assortment.reset_index()
customer_orders_assortment.head(15)
Out[19]:
In [205]:
# Let's add product inclusion rates
# Some setup required
customer_orders_assortment['product_inclusion_rate'] = 0.0
user_temp_list = np.unique(customer_orders_assortment['user_id'])
#new = pd.DataFrame()
for i in user_temp_list[65053:]:
user_df = customer_orders_assortment[customer_orders_assortment['user_id'] == [i]]
for x in user_df.index.values:
user_df.set_value(x,
'product_inclusion_rate',
(user_df[user_df['product_id'] == user_df['product_id'][x]]['product_id'].count() / float(user_df['order_number'].max())
)
)
if len(new) < len(user_df):
new = user_df
else:
new = pd.concat([new, user_df])
new
Out[205]:
In [345]:
# let's jump into user_id 3 to explore trends in order behavior
# Let's start with product counts across orders
user_3_count = new[new['user_id'] == 3]['order_number'].value_counts().sort_index()
f, ax = plt.subplots(figsize=(14,5), ncols=1)
user_3_count.plot(kind='bar')
_= ax.set_title('Products per Order\nUser_id 3', size=22)
_= ax.set_ylabel('Product Count', size=20)
_= ax.set_xlabel('order_number', size=20)
_= ax.tick_params(labelsize=16)
plt.xticks(ha='right', rotation=55);
user_3_count.describe()
Out[345]:
In [321]:
# Now, let's check out how individual products compare in terms of frequency of reorder.
user_3 = new[new['user_id'] == 3].groupby(['product_inclusion_rate',
'product_id'
]).size()
f, ax = plt.subplots(figsize=(14,10), ncols=1)
user_3.plot(kind='bar')
_= ax.set_title('Orders per Product\nUser_id 3', size=22)
_= ax.set_ylabel('Order Count', size=20)
_= ax.set_xlabel('product_id', size=20)
_= ax.tick_params(labelsize=16)
plt.xticks(range(len(user_3)), user_3.index.get_level_values(level=1), ha='right', rotation=55);
In [207]:
#new1.to_csv('inclusion_rate.csv', index=False)
In [ ]: