Data cleanup

This notebook is meant for cleaning up the donation data.

The following is a summary of some of the cleanup tasks from this notebook:

  1. Load the CSV file that has the donor data
  2. Strip whitespace from all the columns
  3. Fill NA values with empty strings
  4. Change column data types (after examining for correctness)
  5. Clean up the amounts column - remove negative values (totaling -641910.46 dollars) and zero values
  6. Clean up state codes
  7. Remove donations from outside the US - about \$30,000 USD
  8. Remove donations totaling 9.5 million dollars that came from an anonymous donor (as outliers)
  9. Move rows with missing or inaccurate location information to a separate dataframe
  10. Fill in missing city and state names based on the zipcodes dataset

In [1]:
import pandas as pd
import numpy as np
import locale
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool

%matplotlib inline
from bokeh.plotting import output_notebook
output_notebook()


BokehJS successfully loaded.

In [3]:
_ = locale.setlocale(locale.LC_ALL, '')
thousands_sep = lambda x: locale.format("%.2f", x, grouping=True)
#example:
print thousands_sep(1234567890.76543)

getdate_ym = lambda x: str(x.year) + "_" + str(x.month)
getdate_ymd = lambda x: str(x.month) + "/" + str(x.day) + "/" + str(x.year)
dates = pd.DatetimeIndex(['2010-10-17', '2011-05-13', "2012-01-15"])
map(getdate_ym, dates)
map(getdate_ymd, dates)


1,234,567,890.77
Out[3]:
['10/17/2010', '5/13/2011', '1/15/2012']

Load csv


In [4]:
#df = pd.read_csv('sef_donors_sample.csv', skipinitialspace=True, sep=',\s*', engine='python', quotechar='"')
df = pd.read_csv('in/sef_donors_edit.csv')
source_columns = ['donor_id', 'amount_initial', 'donation_date', 'appeal', 'fund', 'city', 'state', 'zipcode_initial', 'batch_num', 'charitable', 'sales']
df.columns = source_columns


In [ ]:
df.info()

In [ ]:
## Strip leading/trailing whitespace from every string cell
strip_func = lambda x: x.strip() if isinstance(x, str) else x
df = df.applymap(strip_func)

Address nan column values


In [ ]:
df.replace({'appeal': {'0': ''}}, inplace=True)
df.appeal.fillna('', inplace=True)
df.fund.fillna('', inplace=True)

Change column types and drop unused columns


In [ ]:
df.donation_date = pd.to_datetime(df.donation_date)
df.batch_num = df.batch_num.astype('str')
df.charitable = df.charitable.astype('bool')
df['zipcode'] = df.zipcode_initial.str[0:5]  # keep just the 5-digit ZIP (drops any +4 extension)

In [ ]:
fill_zipcode = lambda x: '0'*(5-len(str(x))) + str(x)  # left-pad zip codes with zeros to 5 digits
x1 = pd.DataFrame([[1, '8820'], [2, 8820]], columns=['a','b'])
x1.b = x1.b.apply(fill_zipcode)
x1
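
The same padding can also be written with str.zfill - a quick equivalent sketch on toy values (hypothetical, not from the data):


In [ ]:
# Equivalent padding using zfill (a sketch, on hypothetical values)
print pd.Series(['8820', 8820]).astype(str).str.zfill(5).tolist()  # ['08820', '08820']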

In [ ]:
df.zipcode = df.zipcode.apply(fill_zipcode)

Cleanup amounts


In [ ]:
## Ensure that all amounts are dollar figures: show any rows that start with neither '$' nor '-$'
df[~df.amount_initial.str.startswith('-$') & ~df.amount_initial.str.startswith('$')]

In [ ]:
df['amount_cleanup'] = df.amount_initial.str.replace(',', '')
df['amount_cleanup'] = df.amount_cleanup.str.replace(r'\$', '')  # escape the $; str.replace treats the pattern as a regex, and a bare '$' only matches end-of-string
df['amount'] = df.amount_cleanup.astype(float)
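
As a quick sanity check, here is the same parsing applied to a few toy values (hypothetical, not from the data):


In [ ]:
# Sanity check of the amount parsing on toy values (a sketch)
s = pd.Series(['$1,234.56', '-$641,910.46', '$0.00'])
print s.str.replace(',', '').str.replace(r'\$', '').astype(float).tolist()
# [1234.56, -641910.46, 0.0]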

In [ ]:
## Make sure we did not throw away valid numbers by checking with the original value
df[(df.amount == 0)].amount_initial.unique()

In [ ]:
## drop row with invalid data
df.drop(df[df.donation_date == '1900-01-01'].index, axis=0, inplace=True)

Outlier data


In [ ]:
# There are some outliers in the data; quite a few of them are recent.
_ = plt.scatter(df[df.amount > 5000].amount.values, df[df.amount > 5000].donation_date.values)
plt.show()

In [ ]:
# Fun little thing to try out bokeh (we can hover and detect the culprits)
def plot_data(df):
    dates = map(getdate_ym, pd.DatetimeIndex(df[df.amount > 5000].donation_date))
    amounts = map(thousands_sep, df[df.amount > 5000].amount)
    x = df[df.amount > 5000].donation_date.values
    y = df[df.amount > 5000].amount.values
    donor_ids = df[df.amount > 5000].donor_id.values
    states = df[df.amount > 5000].state.values

    source = ColumnDataSource(
            data=dict(
                x=x,
                y=y,
                dates=dates,
                amounts=amounts,
                donor_ids=donor_ids,
                states=states,
            )
        )

    hover = HoverTool(
            tooltips=[
                ("date", "@dates"),
                ("amount", "@amounts"),
                ("donor", "@donor_ids"),
                ("states", "@states"),
            ]
        )

    p = figure(plot_width=400, plot_height=400, title=None, tools=[hover])
    p.circle('x', 'y', size=5, source=source)

    show(p)

In [ ]:
plot_data(df.query('amount > 5000'))

In [ ]:
# All the outliers seem to share two properties: state == 'YY' and a specific donor_id.
# Plot the remaining data outside of these to check that we caught all the outliers.
plot_data(df[~df.index.isin(df.query('state == "YY" and amount > 5000').index)])

In [ ]:
# Outlier data
df[(df.state == 'YY') & (df.amount >= 45000)]

In [ ]:
df[(df.state == 'YY') & (df.amount >= 45000)]\
    .sort_values(by='amount', ascending=False)\
    .head(6)[source_columns]\
    .to_csv('out/0/outlier_data.csv')

Exchanged emails with Anil and confirmed the decision to drop the outliers for the anonymous donor totaling 9.5 million dollars.


In [ ]:
df.drop(df[(df.state == 'YY') & (df.amount >= 45000)].index, inplace=True)

In [ ]:
print 'After dropping the anonymous donor, total amounts from the unknown state as a percentage of all amounts is: '\
    , thousands_sep(100*df[(df.state == 'YY')].amount.sum()/df.amount.sum()), '%'

Amounts with zero values


In [ ]:
## Some funds have zero amounts associated with them.
## They mostly look like costs - expense fees, transaction fees, administrative fees
## Let us examine if we can safely drop them from our analysis

df[df.amount_initial == '$0.00'].groupby(['fund', 'appeal'])['donor_id'].count()

Dropping rows with zero amounts (after confirmation with SEF office)


In [ ]:
df.drop(df[df.amount == 0].index, axis=0, inplace=True)

Negative amounts


In [ ]:
## What is the total of the negative amounts? (-640910.46 dollars)
print 'Total negative amount is: ', df[df.amount < 0].amount.sum()

In [ ]:
# Add if condition to make this re-runnable
if df[df.amount < 0].amount.sum() < 0:  # a negative sum means negative amounts are still present
    print 'Amounts grouped by fund and appeal, sorted by most negative amounts'
    df[df.amount < 0]\
        .groupby(['fund', 'appeal'])[['amount']]\
        .sum()\
        .sort_values(by='amount')\
        .to_csv('out/0/negative_amounts_sorted.csv')

    df[df.amount < 0]\
        .groupby(['fund', 'appeal'])[['amount']]\
        .sum()\
        .to_csv('out/0/negative_amounts_grouped_by_fund.csv')

Dropping rows with negative amounts (after confirmation with SEF office)


In [ ]:
df.drop(df[df.amount < 0].index, axis=0, inplace=True)

Investigate invalid state codes


In [ ]:
df.info()

In [ ]:
df.state.unique()

In [ ]:
## States imported from http://statetable.com/
states = pd.read_csv('in/state_table.csv')
states.rename(columns={'abbreviation': 'state'}, inplace=True)

In [ ]:
all_states = pd.merge(states, pd.DataFrame(df.state.unique(), columns=['state']), on='state', how='right')
invalid_states = all_states[pd.isnull(all_states.id)].state
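
The same invalid codes can also be found directly with isin - an equivalent sketch (invalid_states_alt is a hypothetical name):


In [ ]:
# Equivalent check (a sketch): state codes missing from the reference table
invalid_states_alt = df.loc[~df.state.isin(states.state), 'state'].unique()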

In [ ]:
df[df.state.isin(invalid_states)].state.value_counts()

In [ ]:
df[df.state.isin(['VI', 'PR', 'Ny', 'HY', 'BC', 'IO', 'AB', 'Ca', 'Co', 'CF', 'ja', 'UK', 'KA'])]

In [ ]:
%%html
<style>table {float:left}</style>

Explanation for invalid state codes:

| State | Count | Action | Explanation |
|-------|-------|--------|-------------|
| YY | 268 | None | All these rows are bogus entries (city and zip are also YYYYs) - about 20% of the donation amount has this |
| ON | 62 | Remove | Ontario, Canada |
| AP | 18 | Remove | Data for Hyderabad |
| VI | 6 | Remove | Virgin Islands |
| PR | 5 | Remove | Puerto Rico |
| Ny | 5 | NY | Same as NY - rename Ny to NY |
| 56 | 1 | Remove | One donation from Bangalore, Karnataka |
| HY | 1 | Remove | Hyderabad |
| BC | 1 | Remove | British Columbia, Canada |
| IO | 1 | IA | Changed to Iowa - based on city and zip code |
| AB | 1 | Remove | Alberta, Canada |
| Ca | 1 | CA | Same as California - rename Ca to CA |
| Co | 1 | CO | Same as Colorado - rename Co to CO |
| CF | 1 | FL | Changed to Florida based on zip code and city |
| ja | 1 | FL | Changed to FL based on zip code and city |
| UK | 1 | Remove | London, UK |
| KA | 1 | Remove | Bangalore, Karnataka |

In [ ]:
state_renames = {'Ny': 'NY', 'IO': 'IA', 'Ca' : 'CA', 'Co' : 'CO', 'CF' : 'FL', 'ja' : 'FL'}
df.replace({'state': state_renames}, inplace=True)

Dropping data for non-US locations


In [ ]:
non_usa_states = ['ON', 'AP', 'VI', 'PR', '56', 'HY', 'BC', 'AB', 'UK', 'KA']
print 'Total amount for locations outside USA: ', sum(df[df.state.isin(non_usa_states)].amount)
#### Total amount for locations outside USA:  30576.34

In [ ]:
df.drop(df[df.state.isin(non_usa_states)].index, axis=0, inplace=True)

Investigate donations with state of YY


In [ ]:
print 'Percentage of amount for unknown (YY) state : {:.2f}'.format(100*df[df.state == 'YY'].amount.sum()/df.amount.sum())

In [ ]:
print 'Total amount for the unknown state excluding outliers: ', df[(df.state == 'YY') & (df.amount < 45000)].amount.sum()
print 'Total amount for the unknown state: ', df[(df.state == 'YY')].amount.sum()
print 'Total amount: ', df.amount.sum()

We will add these donations to the noloc_df below (the dataframe of donations that have empty strings for the city/state/zipcode).

Investigate empty city, state and zip code

Percentage of total amount from donations with no location: 3.087

Moving all the data with no location to a different dataframe.

We will investigate the data that does have location information for correctness of location and then merge the no location data back at the end.


In [ ]:
print 'Percentage of total amount from donations with no location: ', 100*sum(df[(df.city == '') & (df.state == '') & (df.zipcode_initial == '')].amount)/sum(df.amount)

In [ ]:
noloc_df = df[(df.city == '') & (df.state == '') & (df.zipcode_initial == '')].copy()
df = df[~((df.city == '') & (df.state == '') & (df.zipcode_initial == ''))].copy()

In [ ]:
print df.shape[0] + noloc_df.shape[0]

In [ ]:
noloc_df = noloc_df.append(df[(df.state == 'YY')])
df = df[~(df.state == 'YY')]

In [ ]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]

Investigate city values of 'YYY' / 'YYYY'

These entries have invalid location information and will be added to the noloc_df dataframe.


In [ ]:
noloc_df = noloc_df.append(df[(df.city.str.lower() == 'yyy') | (df.city.str.lower() == 'yyyy')])
df = df[~((df.city.str.lower() == 'yyy') | (df.city.str.lower() == 'yyyy'))]

In [ ]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]

Investigate empty state but non-empty city

Percentage of total amount for data with City but no state: 0.486


In [ ]:
print 'Percentage of total amount for data with City but no state: {:.3f}'.format(100*sum(df[df.state == ''].amount)/sum(df.amount))
df[((df.state == '') & (df.city != ''))][['city','zipcode','amount']].sort_values('city', ascending=True).to_csv('out/0/City_No_State.csv')

Visually examining the cities for rows that don't have a state shows that, except for two entries, they all come from Canada, India, and a few other countries. We will correct those two entries and drop the remaining rows, since they are not relevant to the USA.


In [ ]:
index = df[(df.donor_id == '-28K0T47RF') & (df.donation_date == '2007-11-30') & (df.city == 'Cupertino')].index
df.ix[index, 'state'] = 'CA'
index = df[(df.donor_id == '9F4812A118') & (df.donation_date == '2012-06-30') & (df.city == 'San Juan')].index
df.ix[index, 'state'] = 'WA'
df.ix[index, 'zipcode'] = '98250'  # keep the zipcode a string, like the rest of the column

In [ ]:
# Verified that these remaining entries are for non-US locations
print 'Total amount for non-USA location: ', df[((df.state == '') & (df.city != ''))].amount.sum()

In [ ]:
df.drop(df[((df.state == '') & (df.city != ''))].index, inplace=True)

Investigate empty city and zipcode but valid US state

Percentage of total amount for data with valid US state, but no city, zipcode: 4.509

Most of this amount (1.7 of 1.8 million) comes from about 600 donors in California. We already know that California is a major contributor to donations.

Although we could do some state-level analytics with this data, the added complexity does not justify the knowledge gained.

Therefore, we drop the state value from these rows and move them over to the no-location dataset we created earlier, to simplify our analysis.


In [ ]:
print 'Percentage of total amount for data with valid US state, but no city, zipcode: {:.3f}'.format(100*sum(df[(df.city == '') & (df.zipcode_initial == '')].amount)/sum(df.amount))

In [ ]:
# Baseline row count before the move; the totals after the move must match this.
print df.shape[0] + noloc_df.shape[0]

In [ ]:
stateonly_df = df[(df.city == '') & (df.zipcode_initial == '')].copy()
stateonly_df.state = ''

## Move the rows with just the state over to the noloc_df dataset
noloc_df = pd.concat([noloc_df, stateonly_df])
df = df[~((df.city == '') & (df.zipcode_initial == ''))].copy()

In [ ]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]

In [ ]:
print 100*sum(df[df.city == ''].amount)/sum(df.amount)

print len(df[df.city == '']), len(df[df.zipcode_initial == ''])
print sum(df[df.city == ''].amount), sum(df[df.zipcode_initial == ''].amount)
print sum(df[(df.city == '') & (df.zipcode_initial != '')].amount),\
    sum(df[(df.city != '') & (df.zipcode_initial == '')].amount)

print sum(df.amount)

Investigating empty city and empty state with non-empty zip code

Since we have zip code data from the US Census, we can use it to fill in the missing city and state.


In [ ]:
## Zip codes from ftp://ftp.census.gov/econ2013/CBP_CSV/zbp13totals.zip
zipcodes = pd.read_csv('in/zbp13totals.txt', dtype={'zip': object})
zipcodes = zipcodes[['zip', 'city', 'stabbr']]
zipcodes = zipcodes.rename(columns = {'zip': 'zipcode', 'stabbr': 'state'})
zipcodes.city = zipcodes.city.str.title()
zipcodes.zipcode = zipcodes.zipcode.astype('str')

In [ ]:
## If we know the zip code, we can populate the city by using the zipcodes data
df.replace({'city': {'': np.nan}, 'state': {'': np.nan}}, inplace=True)

## Set the index correctly for update to work. Then reset it back.
df.set_index(['zipcode'], inplace=True)
zipcodes.set_index(['zipcode'], inplace=True)

df.update(zipcodes, join='left', overwrite=False, raise_conflict=False)
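
For reference, a minimal sketch (with made-up frames) of how DataFrame.update fills NaNs by index alignment when overwrite=False:


In [ ]:
# Toy example (hypothetical values): update only fills missing cells, keeping existing ones
left = pd.DataFrame({'city': [np.nan, 'Boston']}, index=['02139', '02140'])
right = pd.DataFrame({'city': ['Cambridge', 'Somerville']}, index=['02139', '02141'])
left.update(right, overwrite=False)
print left  # 02139 -> Cambridge (filled), 02140 -> Boston (kept), 02141 -> ignored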

In [ ]:
df.reset_index(drop=False, inplace=True)
zipcodes.reset_index(drop=False, inplace=True)

In [ ]:
zipcodesdetail = pd.read_csv('in/zip_code_database.csv')

In [ ]:
zipcodesdetail = zipcodesdetail[zipcodesdetail.country == 'US'][['zip', 'primary_city', 'county', 'state', 'timezone', 'latitude', 'longitude']]
zipcodesdetail = zipcodesdetail.rename(columns = {'zip':'zipcode', 'primary_city': 'city'})

In [ ]:
# The zip codes dataset has quite a few missing values. Filling in what we need for now.
# If this happens again, search for a different data source!!
zipcodesdetail.loc[(zipcodesdetail.city == 'Frisco') & (zipcodesdetail.state == 'TX') & (pd.isnull(zipcodesdetail.county)), 'county'] = 'Denton'

In [ ]:
# Strip the ' County' portion from the county names
def getcounty(county):
    if pd.isnull(county):
        return county
    elif county.endswith(' County'):
        return county[:-7]
    else:
        return county

zipcodesdetail.county = zipcodesdetail['county'].apply(getcounty)

In [ ]:
zipcodesdetail.zipcode = zipcodesdetail.zipcode.apply(fill_zipcode)

In [ ]:
newcols = np.array(list(set(df.columns).union(zipcodesdetail.columns)))  # union of both frames' columns (set order is arbitrary)

In [ ]:
## Inner join: rows whose (state, city, zipcode) combination is not in the zip code database get dropped here
df = pd.merge(df, zipcodesdetail, on=['state', 'city', 'zipcode'], how='inner', suffixes=('_x', ''))[newcols]

In [ ]:
# For some reason, the data types are being reset. So setting them back to their expected data types.
df.donation_date = df.donation_date.apply(pd.to_datetime)
df.charitable = df.charitable.apply(bool)
df.amount = df.amount.apply(float)  # amounts include cents, so keep them as floats

Investigate invalid zip codes


In [ ]:
all_zipcodes = pd.merge(df, zipcodes, on='zipcode', how='left')
all_zipcodes[pd.isnull(all_zipcodes.city_x)].head()

In [ ]:
## Only a couple of rows have invalid zip codes. Let's drop them.
df.drop(df[df.zipcode_initial.isin(['GU214ND','94000'])].index, axis=0, inplace=True)

Final check on all location data to confirm that we have no rows with an empty state, city, or zipcode


In [ ]:
print 'No state: count of rows: ', len(df[df.state == ''].amount),\
      'Total amount: ', sum(df[df.state == ''].amount)
print 'No zipcode: count of rows: ', len(df[df.zipcode == ''].amount),\
      'Total amount: ', sum(df[df.zipcode == ''].amount)
print 'No city: count of rows: ', len(df[df.city == ''].amount),\
      'Total amount: ', sum(df[df.city == ''].amount)

In [ ]:
# Examining data - top 10 states by amount and number of donors
print df.groupby('state')[['amount']].sum().sort_values(by='amount', ascending=False)[0:10]
print df.groupby('state')[['donor_id']].count().sort_values(by='donor_id', ascending=False)[0:10]

In [ ]:
print noloc_df.state.unique()
print noloc_df.city.unique()
print noloc_df.zipcode.unique()

In [ ]:
noloc_df['city'] = ''
noloc_df['state'] = ''
noloc_df['zipcode'] = ''

In [ ]:
print df.shape[0] + noloc_df.shape[0]

In [ ]:
df.shape, noloc_df.shape

In [ ]:
# The input data has only the latest zip code for each donor, so we cannot observe any movement even
# if there was any: all donations by a given donor carry the same exact zipcode.
x1 = pd.DataFrame(df.groupby('donor_id').zipcode.nunique())
x1[x1.zipcode != 1]

In [ ]:
# The noloc_df and the df with location values have no donors in common - so we cannot use the donor
# location information from df to detect the location in noloc_df.
set(df.donor_id.values).intersection(noloc_df.donor_id.values)

In [ ]:
df.rename(columns={'donation_date': 'activity_date'}, inplace=True)
df['activity_year'] = df.activity_date.apply(lambda x: x.year)
df['activity_month'] = df.activity_date.apply(lambda x: x.month)
df['activity_dow'] = df.activity_date.apply(lambda x: x.dayofweek)
df['activity_ym'] = df['activity_date'].map(lambda x: 100*x.year + x.month)
df['activity_yq'] = df['activity_date'].map(lambda x: 10*x.year + (x.month-1)//3)
df['activity_ymd'] = df['activity_date'].map(lambda x: 10000*x.year + 100*x.month + x.day)
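
A quick worked example of the three numeric date keys on a sample date (hypothetical value):


In [ ]:
# Worked example of the date-key encodings (a sketch)
d = pd.Timestamp('2012-05-13')
print 100*d.year + d.month                  # 201205   (year-month)
print 10*d.year + (d.month - 1)//3          # 20121    (year-quarter, quarters numbered 0-3)
print 10000*d.year + 100*d.month + d.day    # 20120513 (year-month-day)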

In [ ]:
# Drop the zipcode_initial (for privacy reasons)
df.drop('zipcode_initial', axis=1, inplace=True)

All done! Let's save our dataframes for the next stage of processing.


In [ ]:
!mkdir -p out/0
df.to_pickle('out/0/donations.pkl')
noloc_df.to_pickle('out/0/donations_noloc.pkl')

In [ ]:
df[df.donor_id == '_1D50SWTKX'].sort_values(by='activity_date').tail()

In [ ]:
df.columns
