In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

pd.set_option('max_columns', 50)
mpl.rcParams['lines.linewidth'] = 2

%matplotlib inline



In [2]:

    
# NOTE(review): this is an absolute path on the original author's machine;
# point it at your own copy of relay-foods.xlsx before running the notebook.
orders_path = '/Users/gjreda/Dropbox/datasets/relay-foods.xlsx'

# One row per order.
df = pd.read_excel(orders_path)
df.head()









    Out[2]:






  
    
      
      OrderId
      OrderDate
      UserId
      TotalCharges
      CommonId
      PupId
      PickupDate
    
  
  
    
      0
      262
      2009-01-11
      47
      50.67
      TRQKD
      2
      2009-01-12
    
    
      1
      278
      2009-01-20
      47
      26.60
      4HH2S
      3
      2009-01-20
    
    
      2
      294
      2009-02-03
      47
      38.71
      3TRDC
      2
      2009-02-04
    
    
      3
      301
      2009-02-06
      47
      53.38
      NGAZJ
      2
      2009-02-09
    
    
      4
      302
      2009-02-06
      47
      14.28
      FFYHD
      2
      2009-02-09

1. Create a period column based on the OrderDate

Since we're doing monthly cohorts, we'll be looking at the total monthly behavior of our users. Therefore, we don't want granular OrderDate data (right now).



In [3]:

    
# Bucket every order into its calendar month, formatted as 'YYYY-MM'.
# The vectorised .dt accessor replaces a Python-level lambda per row and
# yields NaN (instead of calling strftime on NaT) for any missing OrderDate.
# Assumes OrderDate was parsed as a datetime column by read_excel.
df['OrderPeriod'] = df['OrderDate'].dt.strftime('%Y-%m')
df.head()









    Out[3]:






  
    
      
      OrderId
      OrderDate
      UserId
      TotalCharges
      CommonId
      PupId
      PickupDate
      OrderPeriod
    
  
  
    
      0
      262
      2009-01-11
      47
      50.67
      TRQKD
      2
      2009-01-12
      2009-01
    
    
      1
      278
      2009-01-20
      47
      26.60
      4HH2S
      3
      2009-01-20
      2009-01
    
    
      2
      294
      2009-02-03
      47
      38.71
      3TRDC
      2
      2009-02-04
      2009-02
    
    
      3
      301
      2009-02-06
      47
      53.38
      NGAZJ
      2
      2009-02-09
      2009-02
    
    
      4
      302
      2009-02-06
      47
      14.28
      FFYHD
      2
      2009-02-09
      2009-02

2. Determine the user's cohort group (based on their first order)

Create a new column called CohortGroup, which is the year and month in which the user's first purchase occurred.



In [4]:

    
# Index the orders by user so that a per-user result can be assigned back
# onto every order row through index alignment.
df.set_index('UserId', inplace=True)

# Earliest order date per user, then formatted as that user's 'YYYY-MM' cohort.
first_order_date = df.groupby(level=0)['OrderDate'].min()
df['CohortGroup'] = first_order_date.apply(lambda first: first.strftime('%Y-%m'))

# Turn UserId back into an ordinary column (it ends up as the first column).
df.reset_index(inplace=True)
df.head()









    Out[4]:






  
    
      
      UserId
      OrderId
      OrderDate
      TotalCharges
      CommonId
      PupId
      PickupDate
      OrderPeriod
      CohortGroup
    
  
  
    
      0
      47
      262
      2009-01-11
      50.67
      TRQKD
      2
      2009-01-12
      2009-01
      2009-01
    
    
      1
      47
      278
      2009-01-20
      26.60
      4HH2S
      3
      2009-01-20
      2009-01
      2009-01
    
    
      2
      47
      294
      2009-02-03
      38.71
      3TRDC
      2
      2009-02-04
      2009-02
      2009-01
    
    
      3
      47
      301
      2009-02-06
      53.38
      NGAZJ
      2
      2009-02-09
      2009-02
      2009-01
    
    
      4
      47
      302
      2009-02-06
      14.28
      FFYHD
      2
      2009-02-09
      2009-02
      2009-01

3. Rollup data by CohortGroup & OrderPeriod

Since we're looking at monthly cohorts, we need to aggregate users, orders, and amount spent by the CohortGroup within the month (OrderPeriod).



In [5]:

    
grouped = df.groupby(['CohortGroup', 'OrderPeriod'])

# Per (CohortGroup, OrderPeriod): distinct users, distinct orders, and revenue.
# String aliases are used instead of callables such as np.sum, which pandas
# deprecates passing to agg() in recent releases; the results are the same.
cohorts = grouped.agg({'UserId': 'nunique',
                       'OrderId': 'nunique',
                       'TotalCharges': 'sum'})

# Make the column names describe what was counted. TotalCharges keeps its name.
cohorts = cohorts.rename(columns={'UserId': 'TotalUsers',
                                  'OrderId': 'TotalOrders'})
cohorts.head()









    Out[5]:






  
    
      
      
      TotalOrders
      TotalUsers
      TotalCharges
    
    
      CohortGroup
      OrderPeriod
      
      
      
    
  
  
    
      2009-01
      2009-01
      30
      22
      1850.255
    
    
      2009-02
      25
      8
      1351.065
    
    
      2009-03
      26
      10
      1357.360
    
    
      2009-04
      28
      9
      1604.500
    
    
      2009-05
      26
      10
      1575.625

4. Label the CohortPeriod for each CohortGroup

We want to look at how each cohort has behaved in the months following their first purchase, so we'll need to index each cohort to their first purchase month. For example, CohortPeriod = 1 will be the cohort's first month, CohortPeriod = 2 is their second, and so on.

This allows us to compare cohorts across various stages of their lifetime.



In [6]:

    
def cohort_period(df):
    """
    Return a copy of `df` with a `CohortPeriod` column numbering its rows 1..N.

    The number is purely positional: the first row gets 1, the second 2, and
    so on. The rows must therefore already be in chronological order, and the
    function is meant to be applied to one group at a time.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single group (e.g. one user or one CohortGroup).

    Returns
    -------
    pandas.DataFrame
        `df` plus an integer `CohortPeriod` column, with the index unchanged.

    Example
    -------
    Say you want to get the 3rd month for every user:
        df = df.sort_values(['UserId', 'OrderTime'])
        df = df.groupby('UserId', group_keys=False).apply(cohort_period)
        df[df.CohortPeriod == 3]
    """
    # assign() builds a new frame, so the group object handed in by
    # groupby.apply is never mutated in place.
    return df.assign(CohortPeriod=np.arange(1, len(df) + 1))

# Number the periods within each CohortGroup (index level 0).
# group_keys=False keeps the original (CohortGroup, OrderPeriod) index; without
# it, newer pandas versions prepend the group key as an extra index level,
# which would duplicate CohortGroup and break the later reset_index().
# NOTE: the numbering is positional, so it equals "months since first purchase"
# only if every cohort has at least one order in every month.
cohorts = cohorts.groupby(level=0, group_keys=False).apply(cohort_period)
cohorts.head()









    Out[6]:






  
    
      
      
      TotalOrders
      TotalUsers
      TotalCharges
      CohortPeriod
    
    
      CohortGroup
      OrderPeriod
      
      
      
      
    
  
  
    
      2009-01
      2009-01
      30
      22
      1850.255
      1
    
    
      2009-02
      25
      8
      1351.065
      2
    
    
      2009-03
      26
      10
      1357.360
      3
    
    
      2009-04
      28
      9
      1604.500
      4
    
    
      2009-05
      26
      10
      1575.625
      5

5. Make sure we did all that right

Let's test data points from the original DataFrame with their corresponding values in the new cohorts DataFrame to make sure all our data transformations worked as expected. As long as none of these raise an exception, we're good.



In [7]:

    
def assert_rollup_matches(cohort_group, order_period):
    """
    Check one (CohortGroup, OrderPeriod) row of `cohorts` against the raw orders.

    Recomputes the distinct users, distinct orders and total charges directly
    from `df` and raises AssertionError if any differs from the rolled-up row.
    Raises KeyError if the pair is not present in `cohorts`.
    """
    orders = df[(df.CohortGroup == cohort_group) & (df.OrderPeriod == order_period)]
    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
    rollup = cohorts.loc[(cohort_group, order_period)]

    assert orders['UserId'].nunique() == rollup['TotalUsers']
    # Compare charges at cent precision to sidestep float summation noise.
    assert orders['TotalCharges'].sum().round(2) == rollup['TotalCharges'].round(2)
    assert orders['OrderId'].nunique() == rollup['TotalOrders']


# Spot-check a cohort's first month, a later month of the same cohort,
# and a later month of a different cohort.
for cohort_group, order_period in [('2009-01', '2009-01'),
                                   ('2009-01', '2009-09'),
                                   ('2009-05', '2009-09')]:
    assert_rollup_matches(cohort_group, order_period)

User Retention by Cohort Group

We want to look at the percentage change of each CohortGroup over time -- not the absolute change.

To do this, we'll first need to create a pandas Series containing each CohortGroup and its size.



In [8]:

    
# reindex the DataFrame
cohorts.reset_index(inplace=True)
cohorts.set_index(['CohortGroup', 'CohortPeriod'], inplace=True)

# create a Series holding the total size of each CohortGroup
cohort_group_size = cohorts['TotalUsers'].groupby(level=0).first()
cohort_group_size.head()









    Out[8]:





CohortGroup
2009-01    22
2009-02    15
2009-03    13
2009-04    39
2009-05    50
Name: TotalUsers, dtype: int64

Now, we'll need to divide the TotalUsers values in cohorts by cohort_group_size. Since DataFrame operations are performed based on the indices of the objects, we'll use unstack on our cohorts DataFrame to create a matrix where each column represents a CohortGroup and each row is the CohortPeriod corresponding to that group.

To illustrate what unstack does, recall the first five TotalUsers values:



In [9]:

    
# First five TotalUsers values, indexed by (CohortGroup, CohortPeriod).
cohorts['TotalUsers'].head()









    Out[9]:





CohortGroup  CohortPeriod
2009-01      1               22
             2                8
             3               10
             4                9
             5               10
Name: TotalUsers, dtype: int64

And here's what they look like when we unstack the CohortGroup level from the index:



In [10]:

    
# unstack(0) pivots index level 0 (CohortGroup) into columns, leaving
# CohortPeriod as the row index; missing combinations become NaN.
cohorts['TotalUsers'].unstack(0).head()









    Out[10]:






  
    
      CohortGroup
      2009-01
      2009-02
      2009-03
      2009-04
      2009-05
      2009-06
      2009-07
      2009-08
      2009-09
      2009-10
      2009-11
      2009-12
      2010-01
      2010-02
      2010-03
    
    
      CohortPeriod
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      22
      15
      13
      39
      50
      32
      50
      31
      37
      54
      130
      65
      95
      100
      24
    
    
      2
      8
      3
      4
      13
      13
      15
      23
      11
      15
      17
      32
      17
      50
      19
      NaN
    
    
      3
      10
      5
      5
      10
      12
      9
      13
      9
      14
      12
      26
      18
      26
      NaN
      NaN
    
    
      4
      9
      1
      4
      13
      5
      6
      10
      7
      8
      13
      29
      7
      NaN
      NaN
      NaN
    
    
      5
      10
      4
      1
      6
      4
      7
      11
      6
      13
      13
      13
      NaN
      NaN
      NaN
      NaN

Now, we can utilize broadcasting to divide each column by the corresponding cohort_group_size.

The resulting DataFrame, user_retention, contains the percentage of users from the cohort purchasing within the given period. For instance, 38.5% of users in the 2009-03 cohort purchased again in month 3 (which would be May 2009).



In [11]:

    
# Matrix of active users: one column per CohortGroup, one row per CohortPeriod.
users_by_period = cohorts['TotalUsers'].unstack(0)

# axis=1 lines cohort_group_size up with the column labels, so each cohort's
# column is divided by that cohort's own size.
user_retention = users_by_period.divide(cohort_group_size, axis=1)
user_retention.head(10)









    Out[11]:






  
    
      CohortGroup
      2009-01
      2009-02
      2009-03
      2009-04
      2009-05
      2009-06
      2009-07
      2009-08
      2009-09
      2009-10
      2009-11
      2009-12
      2010-01
      2010-02
      2010-03
    
    
      CohortPeriod
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      1.000000
      1.000000
      1.000000
      1.000000
      1.00
      1.00000
      1.00
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.00
      1
    
    
      2
      0.363636
      0.200000
      0.307692
      0.333333
      0.26
      0.46875
      0.46
      0.354839
      0.405405
      0.314815
      0.246154
      0.261538
      0.526316
      0.19
      NaN
    
    
      3
      0.454545
      0.333333
      0.384615
      0.256410
      0.24
      0.28125
      0.26
      0.290323
      0.378378
      0.222222
      0.200000
      0.276923
      0.273684
      NaN
      NaN
    
    
      4
      0.409091
      0.066667
      0.307692
      0.333333
      0.10
      0.18750
      0.20
      0.225806
      0.216216
      0.240741
      0.223077
      0.107692
      NaN
      NaN
      NaN
    
    
      5
      0.454545
      0.266667
      0.076923
      0.153846
      0.08
      0.21875
      0.22
      0.193548
      0.351351
      0.240741
      0.100000
      NaN
      NaN
      NaN
      NaN
    
    
      6
      0.363636
      0.266667
      0.153846
      0.179487
      0.12
      0.15625
      0.20
      0.258065
      0.243243
      0.129630
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      7
      0.363636
      0.266667
      0.153846
      0.102564
      0.06
      0.09375
      0.22
      0.129032
      0.216216
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      8
      0.318182
      0.333333
      0.230769
      0.153846
      0.10
      0.09375
      0.14
      0.129032
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      9
      0.318182
      0.333333
      0.153846
      0.051282
      0.10
      0.31250
      0.14
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      10
      0.318182
      0.266667
      0.076923
      0.102564
      0.08
      0.09375
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN

Finally, we can plot the cohorts over time in an effort to spot behavioral differences or similarities. Two common cohort charts are line graphs and heatmaps, both of which are shown below.

Notice that the first period of each cohort is 100% -- this is because our cohorts are based on each user's first purchase, meaning everyone in the cohort purchased in month 1.



In [12]:

    
# Retention curves for three cohorts, drawn through the Axes returned by
# DataFrame.plot rather than the implicit pyplot "current axes".
ax = user_retention[['2009-06', '2009-07', '2009-08']].plot(figsize=(10, 5))
ax.set_title('Cohorts: User Retention')
ax.set_xticks(np.arange(1, 12.1, 1))  # one tick per cohort period, 1 through 12
ax.set_xlim(1, 12)
ax.set_ylabel('% of Cohort Purchasing');



In [13]:

    
# Annotated heatmaps take a fair amount of manual work in plain matplotlib,
# so this chart leans on seaborn instead.
# https://seaborn.pydata.org/

import seaborn as sns
sns.set(style='white')

# Transpose so each row is a cohort and each column a cohort period.
retention_by_cohort = user_retention.T

plt.figure(figsize=(12, 8))
plt.title('Cohorts: User Retention')
# Mask the NaN cells (periods a cohort has not reached yet) so they stay blank.
sns.heatmap(retention_by_cohort,
            mask=retention_by_cohort.isnull(),
            annot=True,
            fmt='.0%');

Unsurprisingly, we can see from the above chart that fewer users tend to purchase as time goes on.

However, we can also see that the 2009-01 cohort is the strongest, which enables us to ask targeted questions about this cohort compared to others -- what other attributes (besides first purchase month) do these users share which might be causing them to stick around? How were the majority of these users acquired? Was there a specific marketing campaign that brought them in? Did they take advantage of a promotion at sign-up? The answers to these questions would inform future marketing and product efforts.

	OrderId	OrderDate	UserId	TotalCharges	CommonId	PupId	PickupDate
0	262	2009-01-11	47	50.67	TRQKD	2	2009-01-12
1	278	2009-01-20	47	26.60	4HH2S	3	2009-01-20
2	294	2009-02-03	47	38.71	3TRDC	2	2009-02-04
3	301	2009-02-06	47	53.38	NGAZJ	2	2009-02-09
4	302	2009-02-06	47	14.28	FFYHD	2	2009-02-09

		TotalOrders	TotalUsers	TotalCharges
CohortGroup	OrderPeriod
2009-01	2009-01	30	22	1850.255
	2009-02	25	8	1351.065
	2009-03	26	10	1357.360
	2009-04	28	9	1604.500
	2009-05	26	10	1575.625

CohortGroup	2009-01	2009-02	2009-03	2009-04	2009-05	2009-06	2009-07	2009-08	2009-09	2009-10	2009-11	2009-12	2010-01	2010-02	2010-03
CohortPeriod
1	22	15	13	39	50	32	50	31	37	54	130	65	95	100	24
2	8	3	4	13	13	15	23	11	15	17	32	17	50	19	NaN
3	10	5	5	10	12	9	13	9	14	12	26	18	26	NaN	NaN
4	9	1	4	13	5	6	10	7	8	13	29	7	NaN	NaN	NaN
5	10	4	1	6	4	7	11	6	13	13	13	NaN	NaN	NaN	NaN

CohortGroup	2009-01	2009-02	2009-03	2009-04	2009-05	2009-06	2009-07	2009-08	2009-09	2009-10	2009-11	2009-12	2010-01	2010-02	2010-03
CohortPeriod
1	1.000000	1.000000	1.000000	1.000000	1.00	1.00000	1.00	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.00	1
2	0.363636	0.200000	0.307692	0.333333	0.26	0.46875	0.46	0.354839	0.405405	0.314815	0.246154	0.261538	0.526316	0.19	NaN
3	0.454545	0.333333	0.384615	0.256410	0.24	0.28125	0.26	0.290323	0.378378	0.222222	0.200000	0.276923	0.273684	NaN	NaN
4	0.409091	0.066667	0.307692	0.333333	0.10	0.18750	0.20	0.225806	0.216216	0.240741	0.223077	0.107692	NaN	NaN	NaN
5	0.454545	0.266667	0.076923	0.153846	0.08	0.21875	0.22	0.193548	0.351351	0.240741	0.100000	NaN	NaN	NaN	NaN
6	0.363636	0.266667	0.153846	0.179487	0.12	0.15625	0.20	0.258065	0.243243	0.129630	NaN	NaN	NaN	NaN	NaN
7	0.363636	0.266667	0.153846	0.102564	0.06	0.09375	0.22	0.129032	0.216216	NaN	NaN	NaN	NaN	NaN	NaN
8	0.318182	0.333333	0.230769	0.153846	0.10	0.09375	0.14	0.129032	NaN	NaN	NaN	NaN	NaN	NaN	NaN
9	0.318182	0.333333	0.153846	0.051282	0.10	0.31250	0.14	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10	0.318182	0.266667	0.076923	0.102564	0.08	0.09375	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN