In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import matplotlib.ticker as ticker
import calendar
In [2]:
mkdir -p viz
In [3]:
# Load the donations and events tables from pickles (presumably written by
# earlier notebooks of this pipeline -- confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('out/21/donations.pkl')
events = pd.read_pickle('out/41/events.pkl')
In [4]:
# Inspect the available columns before a subset of them is selected below.
df.columns
Out[4]:
In [5]:
# Restrict the donations table to the columns this notebook works with.
df = df.loc[:, [
    'donor_id', 'activity_year', 'amount', 'is_service', 'channel',
    'city', 'county', 'state',
    'activity_date', 'activity_ym', 'activity_month',
    'appeal', 'campaign_location_id', 'campaign_month_id',
]]
In [6]:
def get_data(rows):
    '''
    Aggregate the activity rows of a single donor.

    input: rows from dataframe for a specific donor
    output: dataframe with one row per (activity_year, activity_month,
            is_service, state) combination and the summed `amount`,
            i.e. money donated and contributed over the years
    '''
    group_keys = ['activity_year', 'activity_month', 'is_service', 'state']
    # Use the 'sum' alias rather than the builtin `sum`: pandas maps both to the
    # same groupby reduction, but passing the builtin is deprecated.
    # NOTE: groupby drops rows whose key columns are NaN (pandas default).
    return rows.groupby(group_keys).agg({'amount': 'sum'}).reset_index()
In [7]:
# Spot-check the raw 2014 rows of one sample donor.
df[(df.donor_id=='_1D50SWTKX') & (df.activity_year == 2014)].reset_index()
Out[7]:
In [8]:
# Summarise every donor with get_data, then flatten the result: the inner
# (per-donor positional) index level is discarded and donor_id becomes a column.
donor_data = (
    df.groupby('donor_id')
    .apply(get_data)
    .reset_index(level=1, drop=True)
    .reset_index()
)
In [9]:
# Dense rank of activity_year within each donor, computed separately for
# non-service rows (donations) and service rows (contributions). Each series
# keeps the row index of the subset it was computed from.
year_of_donation = (
    donor_data[(donor_data.is_service == False)]
    .groupby('donor_id')['activity_year']
    .rank(method='dense')
    .rename('year_of_donation')
)
year_of_contribution = (
    donor_data[(donor_data.is_service == True)]
    .groupby('donor_id')['activity_year']
    .rank(method='dense')
    .rename('year_of_contribution')
)
In [10]:
# Attach both rank columns by row index; rows without a rank are left as NaN.
donor_data = donor_data.merge(
    year_of_donation.to_frame(), how='left', left_index=True, right_index=True)
donor_data = donor_data.merge(
    year_of_contribution.to_frame(), how='left', left_index=True, right_index=True)
In [11]:
# First forward fill the data and then replace with zeros if there are still any nulls lying around.
# The forward fill is done per donor: a plain column-wide ffill would carry the
# last rank of the previous donor into the leading rows of the next donor.
# (`GroupBy.ffill()` also avoids the deprecated `fillna(method='ffill')`.)
donor_data.year_of_contribution = donor_data\
    .groupby('donor_id')['year_of_contribution']\
    .ffill()\
    .fillna(0)
donor_data.year_of_donation = donor_data\
    .groupby('donor_id')['year_of_donation']\
    .ffill()\
    .fillna(0)
In [12]:
# Spot-check the aggregated rows of one sample donor.
donor_data[donor_data.donor_id=='_1D50SWTKX']
Out[12]:
In [13]:
# Spot-check the aggregated rows of a second sample donor.
donor_data[donor_data['donor_id'] == '-0Q51CZR36']
Out[13]:
In [14]:
def get_repeat_years(years):
    '''
    input: years of activity for donor (pandas Series)
    output: boolean Series with the same index as the input; True where the
            donor was also active in the immediately preceding year
    '''
    # A set gives constant-time membership tests (the array scan was quadratic).
    active_years = set(years.values)
    repeat_years = [y for y in active_years if y - 1 in active_years]
    return years.isin(repeat_years)
In [15]:
# Flag, for every non-service row, whether the donor also gave in the previous
# year. `group_keys=False` keeps the original row index on the result; without
# it, pandas >= 2.0 prepends donor_id to the index and the column assignment
# no longer aligns. Service rows are not part of the subset and end up as NaN.
donor_data['is_repeat_year'] = donor_data[(donor_data.is_service==False)]\
    .groupby('donor_id', group_keys=False)['activity_year']\
    .apply(get_repeat_years)
In [16]:
# Make sure the output directory exists, then persist the per-donor table.
!mkdir -p out/42
donor_data.to_pickle('out/42/donors.pkl')
In [17]:
# Reload the table written by the previous cell.
donor_data = pd.read_pickle('out/42/donors.pkl')
# `donations` is a second name for the column-trimmed `df` (same object, not a copy).
donations = df
In [18]:
import locale
In [19]:
# Two-colour palette shared by the charts below.
color1 = '#67a9cf'
color2 = '#fc8d59'
colors = [color1, color2]
# Adopt the user's default locale so number grouping follows local conventions.
_ = locale.setlocale(locale.LC_ALL, '')


def thousands_sep(x):
    '''
    input: a number
    output: the number formatted with two decimals and locale-aware digit grouping
    '''
    # locale.format() was removed in Python 3.12; format_string() is the
    # supported equivalent and accepts the same arguments.
    return locale.format_string("%.2f", x, grouping=True)
In [20]:
# Median amount a donor gives in their Nth year of donating.
# Only donation rows (is_service == False) are used: year_of_donation is ranked
# on those rows, and the chart built from `data` reports the amount *donated*.
# The previous filter (is_service == True) summed service contributions instead.
yearly_donors = donor_data[donor_data.is_service==False]\
    .groupby(['year_of_donation', 'donor_id'])\
    .amount.sum()\
    .to_frame()
# Keep only year_of_donation in the index; one row per (year, donor) remains.
yearly_donors.index = yearly_donors.index.droplevel(1)
data = yearly_donors.reset_index().groupby('year_of_donation').amount.median().reset_index()
data.columns = ['year_of_donation', 'amount']
In [21]:
# Bar chart of the median amount by Nth year of donation.
# The last two groups are left out, as before (presumably too sparse -- confirm).
plot_data = data[:-2]
fig, ax = plt.subplots(figsize=(12,6))
# Bars are centred on their x value (explicitly, so the result does not depend
# on the matplotlib version's default alignment).
ax.bar(plot_data.year_of_donation, plot_data.amount, color=color1, align='center')
ax.set_xlabel('Nth year of donation', fontsize=16)
ax.set_ylabel('Median amount donated (in dollars) in that year', fontsize=16)
ax.set_ylim([0,400])
# Put one tick under each bar, labelled with the actual year_of_donation value
# rather than with the positional index shifted by 0.5.
ax.set_xticks(plot_data.year_of_donation.values)
ax.set_xticklabels([str(int(n)) for n in plot_data.year_of_donation])
ax.tick_params(labelsize=16)
fig.savefig('viz/Median_Amount_Donated_In_Nth_Year.png')
In [22]:
# data1: per activity_year, the number of distinct donors on non-service rows,
# split by is_repeat_year and then normalised so each year's row sums to 1.
data1 = donor_data[(donor_data.is_service==False)].groupby(['activity_year', 'is_repeat_year']).donor_id.nunique().unstack().fillna(0)
# Relabel the two unstacked columns; assumes they come out ordered
# [False, True] so that False -> 'New' and True -> 'Repeat' -- TODO confirm.
data1 = pd.DataFrame(data1.values, columns=['New','Repeat'], index=np.sort(data1.index.unique()))
data1 = data1.apply(lambda x: x/x.sum(), axis=1)
# data2: same split and normalisation, but on the summed amount.
data2 = donor_data[(donor_data.is_service==False)].groupby(['activity_year', 'is_repeat_year']).amount.sum().unstack().fillna(0)
data2 = pd.DataFrame(data2.values, columns=['New','Repeat'], index=np.sort(data2.index.unique()))
data2 = data2.apply(lambda x: x/x.sum(), axis=1)
# Two stacked bar charts sharing the x axis: donor share on top, amount share below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10,10))
# 'Repeat' is drawn first and 'New' is stacked on top of it.
ax1.bar(data1.index.values, data1.Repeat, color = color2)
ax1.bar(data1.index.values, data1.New, color = color1, bottom=data1.Repeat)
ax1.tick_params(labelsize=16)
ax2.bar(data2.index.values, data2.Repeat, color = color2)
ax2.bar(data2.index.values, data2.New, color = color1, bottom=data2.Repeat)
ax2.tick_params(labelsize=16)
# Rotate the current tick labels, then place one tick per activity year found in donor_data.
locs, labels = plt.xticks()
plt.setp(labels, rotation=90)
plt.xticks(np.sort(donor_data.activity_year.unique()))
plt.savefig('viz/NewVsRepeatDonors.png')
In [23]:
# Total amount per (activity_year, channel), reshaped to one column per channel.
x = donations\
.groupby(['activity_year', 'channel']).amount.sum().to_frame().unstack().fillna(0)
# Drop the outer 'amount' column level so the columns are just channel names.
x.columns = x.columns.droplevel(0)
# Express amounts in millions.
x = x/1000000
plot = x.plot(kind='line', colormap=plt.cm.jet,
fontsize=12,
figsize=(12,18),
)
#plt.legend().set_visible(False)
plt.legend(prop={'size':16}, loc='upper center')
#plot.set_title('Donations flowing through different marketing channels',fontsize=16)
plt.xlabel('Year of donation', fontsize=16)
plt.ylabel('Donation amount (in millions of dollars)', fontsize=16)
plt.tick_params(labelsize=16)
# Write each channel's name next to its 2014 value.
# NOTE: `.values[0]` raises IndexError if the index has no 2014 row.
for idx, x_value in enumerate(x[x.index==2014].values[0]):
plt.text(2014, x_value, x.columns[idx], fontsize=14)
plt.savefig('viz/DonationsFromDifferentMarketingChannels.png')
In [24]:
# Top 15 appeals by total amount among 2014 donations in the 'Other' channel.
x = (
    donations[(donations.activity_year == 2014) & (donations.channel == 'Other')]
    .groupby(['appeal'])
    .amount.sum()
    .fillna(0)
    .sort_values(ascending=False)
    .head(15)
    .to_frame()
)
x.columns = ['Total Donation Amount']
plot = x.plot(kind='bar', fontsize=12, color=color1, figsize=(12, 12))
# A single series needs no legend.
plot.legend().set_visible(False)
plot.set_title('Donations flowing through different marketing channels (2014)', fontsize=16)
plot.set_xlabel('Donations in 2014', fontsize=14)
plt.savefig('viz/OtherIn2014.png')
In [25]:
# Running total of donated dollars through each calendar year (non-service
# rows after 2010): rows are months, columns are years.
# Amounts are truncated to whole dollars before accumulating, which is what
# np.cumsum(..., dtype='int64') did; pandas' own cumsum rejects the numpy
# `dtype` argument, so the cast is done explicitly.
cumulative_years = (
    df[(df.activity_year > 2010) & (df.is_service==False)]
    .groupby(['activity_year', 'activity_month'])[['amount']]
    .sum()
    .unstack()
    .fillna(0)
    .astype('int64')
    .cumsum(axis=1)
    .T
)
# Drop the outer 'amount' level, then label the rows Jan..Dec
# (requires all twelve months to be present in the data).
cumulative_years.index = cumulative_years.index.droplevel(0)
cumulative_years.index = calendar.month_abbr[1:]
# Express amounts in millions.
cumulative_years = cumulative_years/1000000
plot = cumulative_years.plot(kind='line',
                             fontsize=12,
                             figsize=(12,12))
plt.xlabel('Month of donation', fontsize=16)
plt.ylabel('Cumulative Donation amount (in millions of dollars)', fontsize=16)
plt.tick_params(labelsize=16)
plt.legend(prop={'size':16}, loc='upper center')
# Label every line with its year at the last month (x position 11).
vals = cumulative_years.ffill(axis=0)[-1:].columns.values
heights = cumulative_years.ffill(axis=0)[-1:].values[0]
for val, height in zip(vals, heights):
    plt.text(11, height, val, fontsize=14)
plt.savefig('viz/CumulativeDonationsOverTheYears.png')
In [26]:
# Donated amount per month for each year after 2010 (non-service rows only):
# rows are months, columns are years.
ymdata = (
    df[(df.activity_year > 2010) & (df.is_service == False)]
    .groupby(['activity_year', 'activity_month'])[['amount']]
    .sum()
    .unstack()
    .fillna(0)
    .T
)
# Remove the outer 'amount' level and label the rows Jan..Dec.
ymdata.index = ymdata.index.droplevel(0)
ymdata.index = calendar.month_abbr[1:13]
# Express amounts in millions.
ymdata = ymdata / 1000000
plot = ymdata.plot(kind='line', fontsize=12, figsize=(12, 12))
plot.set_xlabel('Month of donation', fontsize=16)
plot.set_ylabel('Donation amount (in millions of dollars)', fontsize=16)
# Lines are labelled directly at their right-hand end, so the legend is hidden.
plot.legend().set_visible(False)
plot.tick_params(labelsize=16)
vals = ymdata.ffill(axis=0)[-1:].columns.values
heights = ymdata.ffill(axis=0)[-1:].values[0]
for val, height in zip(vals, heights):
    plot.text(11, height, val, fontsize=14)
plt.savefig('viz/DonationsOverTheYears.png')
In [27]:
# Preview the first rows of the per-donor table.
donor_data.head()
Out[27]:
In [28]:
def get_churn(year):
    '''
    input: an activity year
    output: number of donors with a non-service row in `year` and no
            non-service row in any later year (reads the global donor_data)
    '''
    donation_rows = donor_data.is_service == False
    gave_this_year = set(
        donor_data[(donor_data.activity_year == year) & donation_rows].donor_id.unique())
    gave_later = set(
        donor_data[(donor_data.activity_year > year) & donation_rows].donor_id.unique())
    return len(gave_this_year - gave_later)
In [29]:
# Years for which churn is computed. Sort first and only then drop the latest
# year: `unique()` returns years in order of appearance, so slicing before
# sorting removed an arbitrary year instead of the most recent one.
churn_years = np.sort(donor_data.activity_year.unique())[:-1]
# Churn is stored as a negative count so it plots below the axis.
churn = pd.Series(
    [-get_churn(year) for year in churn_years],
    name='Churn',
    index=churn_years)
# Donors per year on rows whose year_of_donation is 1, latest year excluded.
new_donors = donor_data[donor_data.year_of_donation==1].groupby('activity_year').donor_id.nunique()[:-1]
new_donors.name = 'New'
# We drop the last row since it does not make sense to predict yearly churn until the year has passed
churn = churn.drop(churn.tail(1).index)
new_donors = new_donors.drop(new_donors.tail(1).index)
In [30]:
# New donors (positive bars) against churned donors (negative bars) per year.
x = churn.index.values
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(x, new_donors, width=0.5, color=color1, align='center', label='New donors')
ax.bar(x, churn, width=0.5, color=color2, align='center', label='Donors churned')
# The legend sits to the right of the axes; bbox_inches='tight' keeps it in the saved file.
ax.legend(prop={'size': 16}, loc=(1, 0.33))
ax.tick_params(labelsize=16)
ax.set_xlabel('Year of donation', fontsize=16)
ax.set_ylabel('Number of donors', fontsize=16)
fig.savefig('viz/ChurnVsNewDonors.png', bbox_inches='tight')
In [31]:
from itertools import cycle
def plot_event_donation_activity(state, years):
    '''
    Plot cumulative monthly donations against event contributions for a state.

    input: state - value matched against the `state` column of the global
                   `donations` and `events` tables
           years - iterable of activity years to draw; each must be present in
                   the state's donations (IndexError otherwise)
    output: dataframe of cumulative donation amounts for the state
            (index: activity_year, columns: activity_month)
    side effect: saves the figure as viz/Events_vs_Donations_<state>.png
    '''
    monthly_totals = donations[(donations.state==state)]\
        .groupby(['activity_year', 'activity_month'])[['amount']]\
        .sum()\
        .unstack()\
        .fillna(0)
    # Truncate to whole dollars, then accumulate across the months. This is what
    # np.cumsum(..., dtype='int64') computed; pandas' cumsum rejects the numpy
    # `dtype` argument, so the cast is explicit.
    ymdata = monthly_totals.astype('int64').cumsum(axis=1)
    # Drop the outer 'amount' level so the columns are just the months.
    ymdata.columns = ymdata.columns.droplevel(0)

    state_events = events[(events.state==state)][['event_name', 'amount', 'activity_month', 'activity_year']]\
        .sort_values(by=['activity_year', 'activity_month']).reset_index(drop=True)

    fig, ax1 = plt.subplots()
    ax1.set_xlabel('Month')
    ax1.set_ylabel('Donation amount')
    # Events are drawn on a second y axis sharing the month axis.
    ax2 = ax1.twinx()
    ax2.set_ylabel('Event contributions')

    # Assumes twelve month columns per year -- the x values are fixed to 1..12.
    months = range(1, 13)
    # Named so it does not shadow the module-level `colors` palette.
    year_colors = cycle(["r", "b", "g"])
    for year in years:
        color = next(year_colors)
        cumulative = ymdata[ymdata.index==year].values[0]
        ax1.plot(months, cumulative, color=color, label=year)
        year_events = state_events[state_events.activity_year==year]
        for ev in year_events.itertuples(index=False):
            bar = ax2.bar(ev.activity_month, ev.amount, width=-0.4, alpha=0.2, color=color)
            # Put event_name on top of the bars
            rect = bar.patches[0]
            ax2.text(rect.get_x() + rect.get_width()/2,
                     rect.get_height() + 5,
                     ev.event_name,
                     ha='center',
                     va='bottom',
                     rotation='vertical',
                     fontsize=12)
    ax1.legend(prop={'size':16}, loc='upper left')
    plt.savefig('viz/Events_vs_Donations_{0}.png'.format(state))
    return ymdata
In [32]:
# Draw the WA chart for 2011. NOTE: this rebinds `ymdata`, which an earlier
# cell used for the monthly (non-cumulative) table.
ymdata = plot_event_donation_activity('WA', [2011])
In [33]:
# Only the last expression of a cell is displayed, so the first line's value
# (index labels of all rows but the last) is computed and discarded; the output
# shown is the values of the last row.
ymdata.ffill(axis=1)[:-1].index.values
ymdata.ffill(axis=1)[-1:].values[0]
Out[33]:
In [34]:
# Running total per calendar year over all rows after 2010 (service rows
# included): rows are (amount, month), columns are years.
# Amounts are truncated to whole dollars before accumulating, which is what
# np.cumsum(..., dtype='int64') did; pandas' cumsum rejects the numpy `dtype`
# argument, so the cast is done explicitly.
cumulative_years = (
    donations[(donations.activity_year > 2010)]
    .groupby(['activity_year', 'activity_month'])[['amount']]
    .sum()
    .unstack()
    .fillna(0)
    .astype('int64')
    .cumsum(axis=1)
    .T
)
In [35]:
# Display the cumulative totals computed in the previous cell.
cumulative_years
Out[35]: