This notebook cleans up the donation data. A summary of the cleanup tasks performed here:

- Parse dollar amounts into numeric values; drop zero, negative, and outlier rows
- Normalize zip codes to 5-digit, zero-padded strings
- Fix invalid state abbreviations and drop non-US locations
- Split donations with no location information into a separate dataset
- Enrich rows with county, timezone, and latitude/longitude from zip code reference data
- Add derived date columns (year, month, day of week, year-month, year-quarter, year-month-day)
In [1]:
import pandas as pd
import numpy as np
import locale
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
%matplotlib inline
from bokeh.plotting import output_notebook
output_notebook()
In [2]:
_ = locale.setlocale(locale.LC_ALL, '')
thousands_sep = lambda x: locale.format("%.2f", x, grouping=True)
#example:
print thousands_sep(1234567890.76543)
getdate_ym = lambda x: str(x.year) + "_" + str(x.month)
getdate_ymd = lambda x: str(x.month) + "/" + str(x.day) + "/" + str(x.year)
dates = pd.DatetimeIndex(['2010-10-17', '2011-05-13', "2012-01-15"])
print map(getdate_ym, dates)   # ['2010_10', '2011_5', '2012_1']
map(getdate_ymd, dates)
Out[2]:
In [3]:
df = pd.read_csv('in/gifts_Feb2016_2.csv')
source_columns = ['donor_id', 'amount_initial', 'donation_date', 'appeal', 'fund', 'city', 'state', 'zipcode_initial', 'charitable', 'sales']
df.columns = source_columns
In [4]:
df.info()
In [5]:
strip_func = lambda x: x.strip() if isinstance(x, str) else x
df = df.applymap(strip_func)
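A quick illustration of what `applymap` with `strip_func` does (a throwaway example with made-up values, not part of the pipeline):

x0 = pd.DataFrame([['  CA ', 42], [' TX', 3.5]], columns=['state', 'n'])
print x0.applymap(strip_func)  # strings are stripped; non-strings pass through unchanged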
In [6]:
df.replace({'appeal': {'0': ''}}, inplace=True)
df.appeal.fillna('', inplace=True)
df.fund.fillna('', inplace=True)
In [7]:
df.donation_date = pd.to_datetime(df.donation_date)
df.charitable = df.charitable.astype('bool')
df['zipcode'] = df.zipcode_initial.str[0:5]
In [8]:
fill_zipcode = lambda x: '0'*(5-len(str(x))) + str(x)
x1 = pd.DataFrame([[1, '8820'], [2, 8820]], columns=['a','b'])
x1.b = x1.b.apply(fill_zipcode)
x1
Out[8]:
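As an aside, the padding lambda is equivalent to Python's built-in `str.zfill` for these inputs; a one-line sanity check (illustrative only):

assert fill_zipcode(8820) == str(8820).zfill(5) == '08820'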
In [9]:
df.zipcode = df.zipcode.apply(fill_zipcode)
In [10]:
## Check that all amounts are dollar figures by listing any that do not start with '$' or '-$'
df[~df.amount_initial.str.startswith('-$') & ~df.amount_initial.str.startswith('$')]
Out[10]:
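A stricter variant of this check could validate the full currency format with a regular expression (a sketch, assuming every amount carries a leading `$` or `-$`, comma grouping, and two decimal places):

bad = df[~df.amount_initial.str.contains(r'^-?\$[\d,]+\.\d{2}$', na=False)]
print len(bad), 'rows do not match the expected currency format'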
In [11]:
## Drop the row with an invalid (1899-12-31) donation date
df.drop(df[df.donation_date == '1899-12-31'].index, axis=0, inplace=True)
In [12]:
df['amount_cleanup'] = df.amount_initial.str.replace(',', '')
df['amount_cleanup'] = df.amount_cleanup.str.replace('$', '')
df['amount'] = df.amount_cleanup.astype(float)
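The same strip-and-cast logic handles negative figures; a quick sanity check on a made-up value:

print float('-$1,234.56'.replace(',', '').replace('$', ''))  # -1234.56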
In [13]:
## Make sure we did not throw away valid numbers by checking with the original value
df[(df.amount == 0)].amount_initial.unique()
Out[13]:
In [14]:
# There are some outliers in the data, quite a few of them are recent.
_ = plt.scatter(df[df.amount > 5000].amount.values, df[df.amount > 5000].donation_date.values)
plt.show()
In [15]:
# Fun little thing to try out bokeh (we can hover and detect the culprits)
def plot_data(df):
    big = df[df.amount > 5000]  # filter once instead of repeating the mask
    source = ColumnDataSource(
        data=dict(
            x=big.donation_date.values,
            y=big.amount.values,
            dates=map(getdate_ym, pd.DatetimeIndex(big.donation_date)),
            amounts=map(thousands_sep, big.amount),
            donor_ids=big.donor_id.values,
            states=big.state.values,
        )
    )
    hover = HoverTool(
        tooltips=[
            ("date", "@dates"),
            ("amount", "@amounts"),
            ("donor", "@donor_ids"),
            ("states", "@states"),
        ]
    )
    p = figure(plot_width=400, plot_height=400, title=None, tools=[hover])
    p.circle('x', 'y', size=5, source=source)
    show(p)
In [16]:
plot_data(df.query('amount > 5000'))
In [17]:
# All the outliers share two properties: state == 'YY' and a specific donor_id.
# Plot the remaining data outside of these to check that we caught all the outliers.
plot_data(df[~df.index.isin(df.query('state == "YY" and amount > 5000').index)])
In [18]:
# Outlier data
df[(df.state == 'YY') & (df.amount >= 45000)]
Out[18]:
In [19]:
df[(df.state == 'YY') & (df.amount >= 45000)]\
.sort_values(by='amount', ascending=False)\
.head(6)[source_columns]\
.to_csv('out/0/outlier_data.csv')
In [20]:
df.drop(df[(df.state == 'YY') & (df.amount >= 45000)].index, inplace=True)
In [21]:
print 'After dropping the anonymous donor, the total amount from the unknown state as a percentage of all amounts is: '\
, thousands_sep(100*df[(df.state == 'YY')].amount.sum()/df.amount.sum()), '%'
In [22]:
## Some funds have zero amounts associated with them.
## They mostly look like costs - expense fees, transaction fees, administrative fees
## Let us examine if we can safely drop them from our analysis
df[df.amount_initial == '$0.00'].groupby(['fund', 'appeal'])['donor_id'].count()
Out[22]:
In [23]:
df.drop(df[df.amount == 0].index, axis=0, inplace=True)
In [24]:
## What is the total of the negative amounts?
print 'Total negative amount is: ', df[df.amount < 0].amount.sum()
In [25]:
# Add an if condition to make this cell re-runnable
# (the sum of negative amounts is negative, so test for < 0)
if df[df.amount < 0].amount.sum() < 0:
    print 'Amounts grouped by fund and appeal, sorted by most negative amounts'
    df[df.amount < 0]\
        .groupby(['fund', 'appeal'])[['amount']]\
        .sum()\
        .sort_values(by='amount')\
        .to_csv('out/0/negative_amounts_sorted.csv')
    df[df.amount < 0]\
        .groupby(['fund', 'appeal'])[['amount']]\
        .sum()\
        .to_csv('out/0/negative_amounts_grouped_by_fund.csv')
In [26]:
df.drop(df[df.amount < 0].index, axis=0, inplace=True)
In [27]:
df.info()
In [28]:
df.state.unique()
Out[28]:
In [29]:
## States imported from http://statetable.com/
states = pd.read_csv('in/state_table.csv')
states.rename(columns={'abbreviation': 'state'}, inplace=True)
In [30]:
all_states = pd.merge(states, pd.DataFrame(df.state.unique(), columns=['state']), on='state', how='right')
invalid_states = all_states[pd.isnull(all_states.id)].state
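The right join keeps every abbreviation seen in the donation data; anything missing from the reference table surfaces with a null `id`. The same idea on a tiny synthetic example (hypothetical values):

ref = pd.DataFrame({'id': [1, 2], 'state': ['CA', 'NY']})
seen = pd.DataFrame({'state': ['CA', 'ZZ']})
merged = pd.merge(ref, seen, on='state', how='right')
print merged[pd.isnull(merged.id)].state  # -> ZZ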
In [31]:
df[df.state.isin(invalid_states)].state.value_counts().sort_index()
Out[31]:
In [32]:
df[df.state.isin(['56', 'AB', 'BC', 'CF', 'Ca', 'Co', 'HY', 'IO', 'Ny', 'PR', 'UK', 'VI', 'ja'])]
Out[32]:
In [33]:
%%html
<style>table {float:left}</style>
State | Count | Action | Explanation |
---|---|---|---|
YY | 268 | None | All these rows are bogus entries (city and zip are also YYYYs); they account for about 20% of the donation amount |
ON | 62 | Remove | This is the province of Ontario, Canada |
AP | 18 | Remove | Andhra Pradesh, India (data for Hyderabad) |
VI | 6 | Remove | Virgin Islands |
PR | 5 | Remove | Puerto Rico |
Ny | 5 | NY | Same as NY - rename Ny as NY |
56 | 1 | Remove | This is one donation from Bangalore, Karnataka |
HY | 1 | Remove | Hyderabad |
BC | 1 | Remove | British Columbia, Canada |
IO | 1 | IA | Changed to Iowa - based on city and zip code |
AB | 1 | Remove | AB stands for Alberta, Canada |
Ca | 1 | CA | Same as California - rename Ca to CA |
Co | 1 | CO | Same as Colorado - rename Co to CO |
CF | 1 | FL | Changed to Florida based on zip code and city |
ja | 1 | FL | Changed to FL based on zip code and city |
UK | 1 | Remove | London, UK |
KA | 1 | Remove | Bangalore, Karnataka |
In [34]:
state_renames = {'Ny': 'NY', 'IO': 'IA', 'Ca' : 'CA', 'Co' : 'CO', 'CF' : 'FL', 'ja' : 'FL'}
df.replace({'state': state_renames}, inplace=True)
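A quick check that none of the renamed abbreviations linger (a sketch using the mapping above):

assert not df.state.isin(state_renames.keys()).any()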
In [35]:
non_usa_states = ['ON', 'AP', 'VI', 'PR', '56', 'HY', 'BC', 'AB', 'UK', 'KA']
print 'Total amount for locations outside USA: ', sum(df[df.state.isin(non_usa_states)].amount)
#### Total amount for locations outside USA: 30710.63
In [36]:
df.drop(df[df.state.isin(non_usa_states)].index, axis=0, inplace=True)
In [37]:
print 'Percentage of amount for unknown (YY) state : {:.2f}'.format(100*df[df.state == 'YY'].amount.sum()/df.amount.sum())
In [38]:
print 'Total amount for the unknown state excluding outliers: ', df[(df.state == 'YY') & (df.amount < 45000)].amount.sum()
print 'Total amount for the unknown state: ', df[(df.state == 'YY')].amount.sum()
print 'Total amount: ', df.amount.sum()
Percentage of total amount from donations with no location: 3.087
We will investigate the data that does have location information for correctness, and then merge the no-location data back in at the end.
In [39]:
print 'Percentage of total amount from donations with no location: ', 100*sum(df[(df.city == '') & (df.state == '') & (df.zipcode_initial == '')].amount)/sum(df.amount)
In [40]:
noloc_df = df[(df.city == '') & (df.state == '') & (df.zipcode_initial == '')].copy()
df = df[~((df.city == '') & (df.state == '') & (df.zipcode_initial == ''))].copy()
In [41]:
print df.shape[0] + noloc_df.shape[0]
In [42]:
noloc_df = noloc_df.append(df[(df.state == 'YY')])
df = df[~(df.state == 'YY')]
In [43]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]
In [44]:
noloc_df = noloc_df.append(df[(df.city.str.lower() == 'yyy') | (df.city.str.lower() == 'yyyy')])
df = df[~((df.city.str.lower() == 'yyy') | (df.city.str.lower() == 'yyyy'))]
In [45]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]
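This move-then-verify pattern repeats a few times; a hypothetical helper (not part of the original notebook) could encapsulate it:

def move_rows(src, dst, mask):
    # Append the rows of src selected by mask to dst, then drop them from src.
    # The combined row count is preserved, which is what the checks above verify.
    dst = dst.append(src[mask])
    src = src[~mask]
    return src, dst

# e.g.: df, noloc_df = move_rows(df, noloc_df, df.city.str.lower().isin(['yyy', 'yyyy']))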
In [46]:
print 'Percentage of total amount for data with City but no state: {:.3f}'.format(100*sum(df[df.state == ''].amount)/sum(df.amount))
df[((df.state == '') & (df.city != ''))][['city','zipcode','amount']].sort_values('city', ascending=True).to_csv('out/0/City_No_State.csv')
Visually examining the cities for rows without a state shows that, with two exceptions, they all come from Canada, India, or other countries. We will correct those two entries and drop the remaining rows, since they are not relevant to the USA.
In [47]:
index = df[(df.donor_id == '-28K0T47RF') & (df.donation_date == '2007-11-30') & (df.city == 'Cupertino')].index
df.loc[index, 'state'] = 'CA'
index = df[(df.donor_id == '9F4812A118') & (df.donation_date == '2012-06-30') & (df.city == 'San Juan')].index
df.loc[index, 'state'] = 'WA'
df.loc[index, 'zipcode'] = '98250'  # keep zip codes as strings, consistent with the rest of the column
In [48]:
# Verified that these remaining entries are for non-US locations
print 'Total amount for non-USA location: ', df[((df.state == '') & (df.city != ''))].amount.sum()
In [49]:
df.drop(df[((df.state == '') & (df.city != ''))].index, inplace=True)
Percentage of total amount for data with a valid US state, but no city or zipcode: 4.509
Most of this amount (1.7 of 1.8 million) comes from about 600 donors in California, and we already know California is a major contributor to donations.
Although we could do some state-level analysis with this data, it complicates the analysis without a commensurate gain in knowledge.
Therefore, we drop the state from these rows and move them to the no-location dataset created earlier, to simplify the analysis.
In [50]:
print 'Percentage of total amount for data with valid US state, but no city, zipcode: {:.3f}'.format(100*sum(df[(df.city == '') & (df.zipcode_initial == '')].amount)/sum(df.amount))
In [51]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]
In [52]:
stateonly_df = df[(df.city == '') & (df.zipcode_initial == '')].copy()
stateonly_df.state = ''
## Move the rows with just the state over to the noloc_df dataset
noloc_df = pd.concat([noloc_df, stateonly_df])
df = df[~((df.city == '') & (df.zipcode_initial == ''))].copy()
In [53]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]
In [54]:
# Quick diagnostics on the rows still missing city or zipcode
print 100*sum(df[df.city == ''].amount)/sum(df.amount)
print len(df[df.city == '']), len(df[df.zipcode_initial == ''])
print sum(df[df.city == ''].amount), sum(df[df.zipcode_initial == ''].amount)
print sum(df[(df.city == '') & (df.zipcode_initial != '')].amount),\
sum(df[(df.city != '') & (df.zipcode_initial == '')].amount)
print sum(df.amount)
In [55]:
## Zip codes from ftp://ftp.census.gov/econ2013/CBP_CSV/zbp13totals.zip
zipcodes = pd.read_csv('in/zbp13totals.txt', dtype={'zip': object})
zipcodes = zipcodes[['zip', 'city', 'stabbr']]
zipcodes = zipcodes.rename(columns={'zip': 'zipcode', 'stabbr': 'state'})
zipcodes.city = zipcodes.city.str.title()
zipcodes.zipcode = zipcodes.zipcode.astype('str')
In [56]:
## If we know the zip code, we can populate the city and state using the zipcodes data
df.replace({'city': {'': np.nan}, 'state': {'': np.nan}}, inplace=True)
## Set the index correctly for update to work. Then reset it back.
df.set_index(['zipcode'], inplace=True)
zipcodes.set_index(['zipcode'], inplace=True)
df.update(zipcodes, join='left', overwrite=False, raise_conflict=False)
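`update` with `overwrite=False` fills only cells that are currently null, so known city/state values are preserved; a minimal demonstration on synthetic frames:

a = pd.DataFrame({'city': [np.nan, 'Austin']}, index=['78701', '78702'])
b = pd.DataFrame({'city': ['Austin', 'SHOULD NOT APPEAR']}, index=['78701', '78702'])
a.update(b, overwrite=False)
print a  # 78701 gets filled in from b; 78702 keeps its existing value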
In [57]:
df.reset_index(drop=False, inplace=True)
zipcodes.reset_index(drop=False, inplace=True)
In [58]:
zipcodesdetail = pd.read_csv('in/zip_code_database.csv')
In [59]:
zipcodesdetail = zipcodesdetail[zipcodesdetail.country == 'US'][['zip', 'primary_city', 'county', 'state', 'timezone', 'latitude', 'longitude']]
zipcodesdetail = zipcodesdetail.rename(columns = {'zip':'zipcode', 'primary_city': 'city'})
In [60]:
# The zip codes dataset has quite a few missing values. Filling in what we need for now.
# If this happens again, search for a different data source!!
zipcodesdetail.loc[(zipcodesdetail.city == 'Frisco') & (zipcodesdetail.state == 'TX') & (pd.isnull(zipcodesdetail.county)), 'county'] = 'Denton'
In [61]:
# Strip the ' County' portion from the county names
def getcounty(county):
    if pd.isnull(county):
        return county
    elif county.endswith(' County'):
        return county[:-7]
    else:
        return county
zipcodesdetail.county = zipcodesdetail['county'].apply(getcounty)
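For example (illustrative values):

print getcounty('Denton County')  # Denton
print getcounty('Dade')           # Dade (unchanged)
print getcounty(np.nan)           # nan passes through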
In [62]:
zipcodesdetail.zipcode = zipcodesdetail.zipcode.apply(fill_zipcode)
In [63]:
newcols = np.array(list(set(df.columns).union(zipcodesdetail.columns)))
In [64]:
df = pd.merge(df, zipcodesdetail, on=['state', 'city', 'zipcode'], how='inner', suffixes=('_x', ''))[newcols]
In [65]:
# The merge resets some column dtypes, so set them back to their expected types.
df.donation_date = df.donation_date.apply(pd.to_datetime)
df.charitable = df.charitable.apply(bool)
df.amount = df.amount.apply(int)
In [66]:
all_zipcodes = pd.merge(df, zipcodes, on='zipcode', how='left')
all_zipcodes[pd.isnull(all_zipcodes.city_x)].head()
Out[66]:
In [67]:
## Only a couple of rows have invalid zip codes. Let's drop them.
df.drop(df[df.zipcode_initial.isin(['GU214ND','94000'])].index, axis=0, inplace=True)
In [68]:
print 'No state: count of rows: ', len(df[df.state == ''].amount),\
'Total amount: ', sum(df[df.state == ''].amount)
print 'No zipcode: count of rows: ', len(df[df.zipcode == ''].amount),\
'Total amount: ', sum(df[df.zipcode == ''].amount)
print 'No city: count of rows: ', len(df[df.city == ''].amount),\
'Total amount: ', sum(df[df.city == ''].amount)
In [69]:
# Examining data - top 10 states by amount and by number of donations
print df.groupby('state')[['amount']].sum().sort_values(by='amount', ascending=False)[0:10]
print df.groupby('state')[['donor_id']].count().sort_values(by='donor_id', ascending=False)[0:10]
In [70]:
print noloc_df.state.unique()
print noloc_df.city.unique()
print noloc_df.zipcode.unique()
In [71]:
noloc_df['city'] = ''
noloc_df['state'] = ''
noloc_df['zipcode'] = ''
In [72]:
print df.shape[0] + noloc_df.shape[0]
In [73]:
df.shape, noloc_df.shape
Out[73]:
In [74]:
# The input data carries only the latest zip code for each donor, so we cannot observe any
# movement even if there was some: all donations by a given donor share the same zip code.
x1 = pd.DataFrame(df.groupby('donor_id').zipcode.nunique())
x1[x1.zipcode != 1]
Out[74]:
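Had the data carried per-donation zip codes, a donor who moved would show up here with nunique > 1; a synthetic example with hypothetical donors:

x2 = pd.DataFrame({'donor_id': ['d1', 'd1', 'd2'], 'zipcode': ['08820', '94043', '08820']})
print x2.groupby('donor_id').zipcode.nunique()  # d1 -> 2 (moved), d2 -> 1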
In [75]:
# The noloc_df and the df with location values have no donors in common - so we cannot use the donor
# location information from df to detect the location in noloc_df.
set(df.donor_id.values).intersection(noloc_df.donor_id.values)
Out[75]:
In [76]:
df.rename(columns={'donation_date': 'activity_date'}, inplace=True)
df['activity_year'] = df.activity_date.apply(lambda x: x.year)
df['activity_month'] = df.activity_date.apply(lambda x: x.month)
df['activity_dow'] = df.activity_date.apply(lambda x: x.dayofweek)
df['activity_ym'] = df['activity_date'].map(lambda x: 100*x.year + x.month)
df['activity_yq'] = df['activity_date'].map(lambda x: 10*x.year + (x.month-1)//3)
df['activity_ymd'] = df['activity_date'].map(lambda x: 10000*x.year + 100*x.month + x.day)
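For a single date, the derived keys look like this (illustrative value; note the quarter index is 0-based):

d = pd.Timestamp('2012-06-30')
print 100*d.year + d.month                 # 201206 (activity_ym)
print 10*d.year + (d.month - 1)//3         # 20121  (activity_yq: 2012, Apr-Jun -> 1)
print 10000*d.year + 100*d.month + d.day   # 20120630 (activity_ymd)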
In [77]:
# Drop the zipcode_initial column (for privacy reasons)
df.drop('zipcode_initial', axis=1, inplace=True)
In [78]:
!mkdir -p out/0
df.to_pickle('out/0/donations.pkl')
noloc_df.to_pickle('out/0/donations_noloc.pkl')
In [79]:
df[df.donor_id == '_1D50SWTKX'].sort_values(by='activity_date').tail()
Out[79]:
In [80]:
df.columns
Out[80]:
In [81]:
df.shape
Out[81]: