In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
In [141]:
# Randomly generated customer IDs and purchased months
df = pd.DataFrame([
{'customer': customer_id, 'month': month}
for month, customer_id in
zip(np.random.randint(1, 13, 100), np.random.randint(1, 31, 100))
])
In [142]:
cohort = df.sort_values(by='month').groupby('customer')['month'].first()
cohort = cohort.to_frame(name='cohort')
In [143]:
cohort.head()
Out[143]:
In [144]:
df_cohort = pd.merge(df, cohort, right_index=True, left_on='customer')
df_cohort.head()
Out[144]:
In [145]:
df_cohort.groupby('cohort').size()
Out[145]:
In [202]:
def cohort_generator(df, group_by, customer_column, period_column):
results = []
for cohort, data in df.groupby(group_by):
cohort_size = data[customer_column].nunique()
unique_periods = sorted(df[period_column].unique())
for period in unique_periods:
unique = data[data[period_column] >= period][customer_column].nunique()
if period >= cohort and unique >= 0:
retention = unique / cohort_size
results.append({'cohort': cohort,
'period': period,
'retention': retention})
c = pd.DataFrame.from_records(results)
return c.pivot(index='period', columns='cohort')
In [163]:
cohort_generator(df_cohort, 'cohort', 'customer', 'month')
Out[163]:
In [7]:
orders = pd.read_csv('orders.csv')[['customer_id', 'created_at']]
In [8]:
orders['created_at'] = pd.to_datetime(orders['created_at'])
In [226]:
orders.head()
Out[226]:
In [9]:
%%time
# Extract year and date only
orders['created_at_month_year'] = orders['created_at'].map(lambda x: x.strftime('%Y-%m'))
In [228]:
orders.head()
Out[228]:
In [10]:
%%time
# Assign a cohort to each
cohort = orders.sort_values(by='created_at_month_year').groupby('customer_id')['created_at_month_year'].first()
cohort = cohort.to_frame(name='cohort')
In [230]:
cohort.head()
Out[230]:
In [11]:
%%time
df_cohort = pd.merge(orders[['customer_id',
'created_at_month_year']],
cohort, right_index=True, left_on='customer_id')
assert len(orders) == len(df_cohort)
In [232]:
df_cohort.head()
Out[232]:
In [239]:
# Number of unique customers in each cohort
df_cohort.groupby('cohort').agg({'customer_id': pd.Series.nunique}).head()
Out[239]:
In [235]:
%%time
cohort = cohort_generator(df_cohort, 'cohort', 'customer_id', 'created_at_month_year')
In [236]:
cohort
Out[236]:
In [2]:
orders = pd.read_csv('orders.csv')[['id', 'customer_id', 'created_at']]
orders.head()
Out[2]:
In [3]:
items = pd.read_csv('order_items.csv')[['order_id', 'product_id', 'quantity']]
items.head()
Out[3]:
In [4]:
# No order has more than one item
items.order_id.value_counts().head()
Out[4]:
In [5]:
products = pd.read_csv('products.csv')[['id', 'price']]
products.head()
Out[5]:
In [6]:
items_products = pd.merge(items,
products,
how='left',
left_on='product_id',
right_on='id')[['order_id', 'quantity', 'price']]
items_products['total'] = items_products.quantity * items_products.price
items_products = items_products[['order_id', 'total']]
items_products.head()
Out[6]:
In [7]:
combined = pd.merge(orders, items_products, how='left', left_on='id', right_on='order_id')
assert len(orders) == len(combined)
combined = combined[['customer_id', 'created_at', 'total']]
combined.head()
Out[7]:
In [8]:
%%time
combined['created_at'] = pd.to_datetime(combined['created_at'])
In [9]:
%%time
# Extract year and date only
combined['created_at_month_year'] = combined['created_at'].map(lambda x: x.strftime('%Y-%m'))
In [10]:
%%time
# Assign a cohort to each
cohort = combined.sort_values(by='created_at_month_year').groupby('customer_id')['created_at_month_year'].first()
cohort = cohort.to_frame(name='cohort')
In [11]:
cohort.head()
Out[11]:
In [12]:
%%time
df_cohort = pd.merge(combined[['customer_id',
'created_at_month_year', 'total']],
cohort, right_index=True, left_on='customer_id')
assert len(orders) == len(df_cohort)
In [13]:
df_cohort.head()
Out[13]:
In [14]:
revenue = df_cohort.groupby(['cohort', 'created_at_month_year']).agg({'total': pd.np.sum})
revenue.head()
Out[14]:
In [58]:
revenue.reset_index().rename(columns={
'created_at_month_year': 'period',
'total': 'revenue'
}).pivot(index='cohort', columns='period')
Out[58]:
In [19]:
def cohort_revenue_generator(df):
results = []
for cohort, data in df.groupby('cohort'):
cohort_size = data['customer_id'].nunique()
unique_periods = sorted(df['created_at_month_year'].unique())
for period in unique_periods:
revenue = data[data['created_at_month_year'] >= period].total.sum()
if period >= cohort and revenue >= 0:
results.append({'cohort': cohort,
'period': period,
'revenue': revenue})
c = pd.DataFrame.from_records(results)
return c.pivot(index='period', columns='cohort')
In [20]:
cohort_revenue_generator(df_cohort)
Out[20]:
In [36]:
orders = pd.read_csv('/Users/amir.ziai/Downloads/data/orders.csv')
orders[orders.id == 1]
Out[36]:
In [37]:
items = pd.read_csv('/Users/amir.ziai/Downloads/data/order_items.csv')
items[items.order_id == 1]
Out[37]:
In [30]:
pd.read_csv('/Users/amir.ziai/Downloads/data/products.csv', nrows=10)
Out[30]:
In [28]:
pd.read_csv('/Users/amir.ziai/Downloads/data/order_items.csv', nrows=10)
Out[28]:
In [27]:
pd.read_csv('/Users/amir.ziai/Downloads/data/customers.csv', nrows=10)
Out[27]:
In [ ]: