In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the donations table produced by step 21 (trusted local pickle artifact).
df = pd.read_pickle('out/21/donations.pkl')
In [3]:
# Inspect the available columns before deciding which ones we need.
df.columns
Out[3]:
In [4]:
# How many distinct appeals (event names) appear among service transactions?
df.loc[df.is_service == True, 'appeal'].nunique()
Out[4]:
In [5]:
# We will only consider services for this step, so limit what we need
dfs = df.loc[df.is_service == True]
del df  # release the full frame; only the service subset is used from here on
In [6]:
# One row per (appeal, location, month) combination with its transaction count.
events_by_attendance = (
    dfs.groupby(['appeal', 'campaign_location_id', 'campaign_month_id'])
       .size()
       .reset_index(name='transaction_count')
)
In [7]:
# One row per (appeal, location, month) combination with its total amount.
# NOTE: after `.amount.sum()` the resulting column is already named 'amount',
# so the original `.rename(columns={0: 'amount'})` was a no-op and is dropped.
events_by_amount = (
    dfs.groupby(['appeal', 'campaign_location_id', 'campaign_month_id'])
       .amount
       .sum()
       .reset_index()
)
In [8]:
# Distribution of per-event transaction counts, for events above 10 transactions
events_by_attendance.loc[events_by_attendance.transaction_count > 10, 'transaction_count'].plot(kind='hist', bins=30)
Out[8]:
In [9]:
# Distribution of per-event totals, limited to events that raised under $10k
events_by_amount.loc[events_by_amount.amount < 10000, 'amount'].plot(kind='hist', bins=50)
Out[9]:
In [10]:
# Raw transaction count for one appeal, as a sanity-check baseline
dfs.loc[dfs.appeal == 'Event_TollywoodThriller'].shape
Out[10]:
In [11]:
# Per-(location, month) transaction counts for this appeal, straight from dfs
dfs.loc[dfs.appeal == 'Event_TollywoodThriller'] \
   .groupby(['appeal', 'campaign_location_id', 'campaign_month_id']) \
   .size()
Out[11]:
In [12]:
# The same appeal as seen through the aggregated attendance table — should match
events_by_attendance.loc[events_by_attendance.appeal == 'Event_TollywoodThriller']
Out[12]:
In [13]:
# Per-(location, month) amount totals for this appeal, straight from dfs
dfs.loc[dfs.appeal == 'Event_TollywoodThriller'] \
   .groupby(['appeal', 'campaign_location_id', 'campaign_month_id']) \
   .amount.sum()
Out[13]:
In [14]:
# The same appeal as seen through the aggregated amount table — should match
events_by_amount.loc[events_by_amount.appeal == 'Event_TollywoodThriller']
Out[14]:
In [15]:
# How many events survive a >= 10 transactions cut-off?
events_by_attendance.loc[events_by_attendance.transaction_count >= 10].shape
Out[15]:
In [16]:
# How many transactions would that cut-off discard in total?
events_by_attendance.loc[events_by_attendance.transaction_count < 10, 'transaction_count'].sum()
Out[16]:
In [17]:
## Trim the event attendance data to at least 10 transactions and merge with the other dataframe to get the amounts
events = events_by_attendance[events_by_attendance.transaction_count >= 10]\
.merge(events_by_amount, how='left', on=['appeal','campaign_location_id','campaign_month_id'])
In [18]:
# Merge with the original dataframe to get the month centers
month_cols = ['appeal', 'campaign_month_id', 'activity_year', 'activity_month', 'activity_ym']
events = events.merge(
    dfs.loc[dfs.is_month_center == True, month_cols],
    how='left', on=['appeal', 'campaign_month_id']).drop_duplicates()
# Merge with the original dataframe to get the location centers
location_cols = ['appeal', 'campaign_location_id', 'county', 'state']
events = events.merge(
    dfs.loc[dfs.is_location_center == True, location_cols],
    how='left', on=['appeal', 'campaign_location_id']).drop_duplicates()
In [19]:
# Sanity check what we got thus far
events.loc[events.appeal == 'Event_TollywoodThriller']
Out[19]:
In [20]:
# Distinct locations for this appeal, from the location-center rows
dfs.loc[(dfs.is_location_center == True) & (dfs.appeal == 'Event_TollywoodThriller'),
        ['county', 'state']].drop_duplicates()
Out[20]:
In [21]:
# Distinct year/month pairs for this appeal, from the month-center rows
dfs.loc[(dfs.is_month_center == True) & (dfs.appeal == 'Event_TollywoodThriller'),
        ['activity_year', 'activity_month']].drop_duplicates()
Out[21]:
We had events in a total of 13 different states.
California had the highest number of events (33). Washington had the next highest number of events (11).
The events were more or less distributed throughout the year.
March 2014 was by far the busiest month for the organization (8 events).
2014 was the busiest year in terms of number of events (22).
Other than that, in general events were spread out quite evenly over the year across the board.
In [22]:
# Number of events per state
events.groupby(by='state').size().plot(kind='barh')
Out[22]:
In [23]:
# Number of events per calendar month
events.groupby(by='activity_month').size().plot(kind='barh')
Out[23]:
In [24]:
# Total amount raised per calendar month
events.groupby('activity_month')['amount'].sum().plot(kind='barh')
Out[24]:
In [25]:
# Number of events per year
events.groupby(by='activity_year').size().plot(kind='barh')
Out[25]:
In [26]:
# Total amount raised per year. Use the string 'sum' rather than the Python
# builtin `sum`: passing the builtin callable to `.agg` is deprecated in modern
# pandas (raises a FutureWarning) and the named aggregation uses the fast path.
events.groupby(['activity_year']).agg({'amount': 'sum'}).plot(kind='bar', legend=True)
Out[26]:
In [27]:
# Build a unique, human-readable event identifier: <appeal>_<location>_<month>.
# Vectorized string construction instead of the original per-row `.apply`
# (same output — `astype(int)` truncates like `int()` — without Python-level
# iteration over every row).
events['event_name'] = (
    events.appeal
    + '_' + events.campaign_location_id.astype(int).astype(str)
    + '_' + events.campaign_month_id.astype(int).astype(str)
)
In [28]:
!mkdir -p out/41
events\
.reset_index()\
.drop('index', axis=1)\
.to_pickle('out/41/events.pkl')
In [ ]: