Data cleanup

This notebook is meant for cleaning up the donation data.

The following is a summary of some of the cleanup tasks from this notebook:

  1. Load the CSV file that has the donor data
  2. Strip whitespace from all the columns
  3. Fill NA values with empty strings
  4. Change column data types (after examining for correctness)
  5. Clean up the amounts column - remove negative values (totaling -641910.46 dollars) and zero values
  6. Clean up state codes
  7. Remove donations from outside the US - about \$30,000 USD
  8. Remove donations totaling 9.5 million dollars that came from an anonymous donor (as outliers)
  9. Move rows with missing or inaccurate location information to a separate dataframe
  10. Fill in missing city and state names based on the zipcodes dataset

In [1]:
import pandas as pd
import numpy as np
import locale
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool

%matplotlib inline
from bokeh.plotting import output_notebook
output_notebook()


BokehJS successfully loaded.

In [3]:
_ = locale.setlocale(locale.LC_ALL, '')
thousands_sep = lambda x: locale.format("%.2f", x, grouping=True)
#example:
print thousands_sep(1234567890.76543)

getdate_ym = lambda x: str(x.year) + "_" + str(x.month)
getdate_ymd = lambda x: str(x.month) + "/" + str(x.day) + "/" + str(x.year)
dates = pd.DatetimeIndex(['2010-10-17', '2011-05-13', "2012-01-15"])
map(getdate_ym, dates)
map(getdate_ymd, dates)


1,234,567,890.77
Out[3]:
['10/17/2010', '5/13/2011', '1/15/2012']

Load csv


In [4]:
#df = pd.read_csv('sef_donors_sample.csv', skipinitialspace=True, sep=',\s*', engine='python', quotechar='"')
df = pd.read_csv('in/sef_donors_edit.csv')
source_columns = ['donor_id', 'amount_initial', 'donation_date', 'appeal', 'fund', 'city', 'state', 'zipcode_initial', 'batch_num', 'charitable', 'sales']
df.columns = source_columns


In [ ]:
df.info()

In [ ]:
## Strip leading/trailing whitespace from every string cell
strip_func = lambda x: x.strip() if isinstance(x, str) else x
df = df.applymap(strip_func)

Address nan column values


In [ ]:
df.replace({'appeal': {'0': ''}}, inplace=True)
df.appeal.fillna('', inplace=True)
df.fund.fillna('', inplace=True)

Change column types and drop unused columns


In [ ]:
df.donation_date = pd.to_datetime(df.donation_date)
df.batch_num = df.batch_num.astype('str')
df.charitable = df.charitable.astype('bool')
df['zipcode'] = df.zipcode_initial.str[0:5]  # keep just the 5-digit ZIP (drops any +4 extension)

In [ ]:
fill_zipcode = lambda x: '0'*(5-len(str(x))) + str(x)  # left-pad zip codes with zeros to 5 digits
x1 = pd.DataFrame([[1, '8820'], [2, 8820]], columns=['a','b'])
x1.b = x1.b.apply(fill_zipcode)
x1
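
The same padding can also be written with str.zfill - a quick equivalent sketch on toy values (hypothetical, not from the data):


In [ ]:
# Equivalent padding using zfill (a sketch, on hypothetical values)
print pd.Series(['8820', 8820]).astype(str).str.zfill(5).tolist()  # ['08820', '08820']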

In [ ]:
df.zipcode = df.zipcode.apply(fill_zipcode)

Cleanup amounts


In [ ]:
## Ensure that all amounts are dollar figures: show any rows that start with neither '$' nor '-$'
df[~df.amount_initial.str.startswith('-$') & ~df.amount_initial.str.startswith('$')]

In [ ]:
df['amount_cleanup'] = df.amount_initial.str.replace(',', '')
df['amount_cleanup'] = df.amount_cleanup.str.replace(r'\$', '')  # escape the $; str.replace treats the pattern as a regex, and a bare '$' only matches end-of-string
df['amount'] = df.amount_cleanup.astype(float)
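
As a quick sanity check, here is the same parsing applied to a few toy values (hypothetical, not from the data):


In [ ]:
# Sanity check of the amount parsing on toy values (a sketch)
s = pd.Series(['$1,234.56', '-$641,910.46', '$0.00'])
print s.str.replace(',', '').str.replace(r'\$', '').astype(float).tolist()
# [1234.56, -641910.46, 0.0]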

In [ ]:
## Make sure we did not throw away valid numbers by checking with the original value
df[(df.amount == 0)].amount_initial.unique()

In [ ]:
## drop row with invalid data
df.drop(df[df.donation_date == '1900-01-01'].index, axis=0, inplace=True)

Outlier data


In [ ]:
# There are some outliers in the data; quite a few of them are recent.
_ = plt.scatter(df[df.amount > 5000].amount.values, df[df.amount > 5000].donation_date.values)
plt.show()

In [ ]:
# Fun little thing to try out bokeh (we can hover and detect the culprits)
def plot_data(df):
    dates = map(getdate_ym, pd.DatetimeIndex(df[df.amount > 5000].donation_date))
    amounts = map(thousands_sep, df[df.amount > 5000].amount)
    x = df[df.amount > 5000].donation_date.values
    y = df[df.amount > 5000].amount.values
    donor_ids = df[df.amount > 5000].donor_id.values
    states = df[df.amount > 5000].state.values

    source = ColumnDataSource(
            data=dict(
                x=x,
                y=y,
                dates=dates,
                amounts=amounts,
                donor_ids=donor_ids,
                states=states,
            )
        )

    hover = HoverTool(
            tooltips=[
                ("date", "@dates"),
                ("amount", "@amounts"),
                ("donor", "@donor_ids"),
                ("states", "@states"),
            ]
        )

    p = figure(plot_width=400, plot_height=400, title=None, tools=[hover])
    p.circle('x', 'y', size=5, source=source)

    show(p)

In [ ]:
plot_data(df.query('amount > 5000'))

In [ ]:
# All the outliers seem to share two properties: state == 'YY' and a specific donor_id.
# Plot the remaining data outside of these to check that we caught all the outliers.
plot_data(df[~df.index.isin(df.query('state == "YY" and amount > 5000').index)])

In [ ]:
# Outlier data
df[(df.state == 'YY') & (df.amount >= 45000)]

In [ ]:
df[(df.state == 'YY') & (df.amount >= 45000)]\
    .sort_values(by='amount', ascending=False)\
    .head(6)[source_columns]\
    .to_csv('out/0/outlier_data.csv')

Exchanged emails with Anil and confirmed the decision to drop the outliers for the anonymous donor totaling 9.5 million dollars.


In [ ]:
df.drop(df[(df.state == 'YY') & (df.amount >= 45000)].index, inplace=True)

In [ ]:
print 'After dropping the anonymous donor, total amounts from the unknown state as a percentage of all amounts is: '\
    , thousands_sep(100*df[(df.state == 'YY')].amount.sum()/df.amount.sum()), '%'

Amounts with zero values


In [ ]:
## Some funds have zero amounts associated with them.
## They mostly look like costs - expense fees, transaction fees, administrative fees
## Let us examine if we can safely drop them from our analysis

df[df.amount_initial == '$0.00'].groupby(['fund', 'appeal'])['donor_id'].count()

Dropping rows with zero amounts (after confirmation with SEF office)


In [ ]:
df.drop(df[df.amount == 0].index, axis=0, inplace=True)

Negative amounts


In [ ]:
## What is the total of the negative amounts? (-640910.46 dollars)
print 'Total negative amount is: ', df[df.amount < 0].amount.sum()

In [ ]:
# Add if condition to make this re-runnable
if df[df.amount < 0].amount.sum() < 0:  # a negative sum means negative amounts are still present
    print 'Amounts grouped by fund and appeal, sorted by most negative amounts'
    df[df.amount < 0]\
        .groupby(['fund', 'appeal'])[['amount']]\
        .sum()\
        .sort_values(by='amount')\
        .to_csv('out/0/negative_amounts_sorted.csv')

    df[df.amount < 0]\
        .groupby(['fund', 'appeal'])[['amount']]\
        .sum()\
        .to_csv('out/0/negative_amounts_grouped_by_fund.csv')

Dropping rows with negative amounts (after confirmation with SEF office)


In [ ]:
df.drop(df[df.amount < 0].index, axis=0, inplace=True)

Investigate invalid state codes


In [ ]:
df.info()

In [ ]:
df.state.unique()

In [ ]:
## States imported from http://statetable.com/
states = pd.read_csv('in/state_table.csv')
states.rename(columns={'abbreviation': 'state'}, inplace=True)

In [ ]:
all_states = pd.merge(states, pd.DataFrame(df.state.unique(), columns=['state']), on='state', how='right')
invalid_states = all_states[pd.isnull(all_states.id)].state
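
The same invalid codes can also be found directly with isin - an equivalent sketch (invalid_states_alt is a hypothetical name):


In [ ]:
# Equivalent check (a sketch): state codes missing from the reference table
invalid_states_alt = df.loc[~df.state.isin(states.state), 'state'].unique()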

In [ ]:
df[df.state.isin(invalid_states)].state.value_counts()

In [ ]:
df[df.state.isin(['VI', 'PR', 'Ny', 'HY', 'BC', 'IO', 'AB', 'Ca', 'Co', 'CF', 'ja', 'UK', 'KA'])]

In [ ]:
%%html
<style>table {float:left}</style>

Explanation for invalid state codes:

| State | Count | Action | Explanation |
|-------|-------|--------|-------------|
| YY | 268 | None | All these rows are bogus entries (city and zip are also YYYYs) - about 20% of the donation amount has this |
| ON | 62 | Remove | Ontario, Canada |
| AP | 18 | Remove | Data for Hyderabad |
| VI | 6 | Remove | Virgin Islands |
| PR | 5 | Remove | Puerto Rico |
| Ny | 5 | NY | Same as NY - rename Ny to NY |
| 56 | 1 | Remove | One donation from Bangalore, Karnataka |
| HY | 1 | Remove | Hyderabad |
| BC | 1 | Remove | British Columbia, Canada |
| IO | 1 | IA | Changed to Iowa - based on city and zip code |
| AB | 1 | Remove | Alberta, Canada |
| Ca | 1 | CA | Same as California - rename Ca to CA |
| Co | 1 | CO | Same as Colorado - rename Co to CO |
| CF | 1 | FL | Changed to Florida based on zip code and city |
| ja | 1 | FL | Changed to FL based on zip code and city |
| UK | 1 | Remove | London, UK |
| KA | 1 | Remove | Bangalore, Karnataka |

In [ ]:
state_renames = {'Ny': 'NY', 'IO': 'IA', 'Ca' : 'CA', 'Co' : 'CO', 'CF' : 'FL', 'ja' : 'FL'}
df.replace({'state': state_renames}, inplace=True)

Dropping data for non-US locations


In [ ]:
non_usa_states = ['ON', 'AP', 'VI', 'PR', '56', 'HY', 'BC', 'AB', 'UK', 'KA']
print 'Total amount for locations outside USA: ', sum(df[df.state.isin(non_usa_states)].amount)
#### Total amount for locations outside USA:  30576.34

In [ ]:
df.drop(df[df.state.isin(non_usa_states)].index, axis=0, inplace=True)

Investigate donations with state of YY


In [ ]:
print 'Percentage of amount for unknown (YY) state : {:.2f}'.format(100*df[df.state == 'YY'].amount.sum()/df.amount.sum())

In [ ]:
print 'Total amount for the unknown state excluding outliers: ', df[(df.state == 'YY') & (df.amount < 45000)].amount.sum()
print 'Total amount for the unknown state: ', df[(df.state == 'YY')].amount.sum()
print 'Total amount: ', df.amount.sum()

We will add these donations to the noloc_df below (the dataframe of donations that have empty strings for the city/state/zipcode).

Investigate empty city, state and zip code

Percentage of total amount from donations with no location: 3.087

Moving all the data with no location to a different dataframe.

We will investigate the data that does have location information for correctness of location and then merge the no location data back at the end.


In [ ]:
print 'Percentage of total amount from donations with no location: ', 100*sum(df[(df.city == '') & (df.state == '') & (df.zipcode_initial == '')].amount)/sum(df.amount)

In [ ]:
noloc_df = df[(df.city == '') & (df.state == '') & (df.zipcode_initial == '')].copy()
df = df[~((df.city == '') & (df.state == '') & (df.zipcode_initial == ''))].copy()

In [ ]:
print df.shape[0] + noloc_df.shape[0]

In [ ]:
noloc_df = noloc_df.append(df[(df.state == 'YY')])
df = df[~(df.state == 'YY')]

In [ ]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]

Investigate city values of 'YYY' / 'YYYY'

These entries have invalid location information and will be added to the noloc_df dataframe.


In [ ]:
noloc_df = noloc_df.append(df[(df.city.str.lower() == 'yyy') | (df.city.str.lower() == 'yyyy')])
df = df[~((df.city.str.lower() == 'yyy') | (df.city.str.lower() == 'yyyy'))]

In [ ]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]

Investigate empty state but non-empty city

Percentage of total amount for data with City but no state: 0.486


In [ ]:
print 'Percentage of total amount for data with City but no state: {:.3f}'.format(100*sum(df[df.state == ''].amount)/sum(df.amount))
df[((df.state == '') & (df.city != ''))][['city','zipcode','amount']].sort_values('city', ascending=True).to_csv('out/0/City_No_State.csv')

Visually examining the cities for rows that don't have a state shows that, except for two entries, they all come from Canada, India, and a few other countries. We will correct those two entries and drop the remaining rows, since they are not relevant to the USA.


In [ ]:
index = df[(df.donor_id == '-28K0T47RF') & (df.donation_date == '2007-11-30') & (df.city == 'Cupertino')].index
df.ix[index, 'state'] = 'CA'
index = df[(df.donor_id == '9F4812A118') & (df.donation_date == '2012-06-30') & (df.city == 'San Juan')].index
df.ix[index, 'state'] = 'WA'
df.ix[index, 'zipcode'] = '98250'  # keep the zipcode a string, like the rest of the column

In [ ]:
# Verified that these remaining entries are for non-US locations
print 'Total amount for non-USA location: ', df[((df.state == '') & (df.city != ''))].amount.sum()

In [ ]:
df.drop(df[((df.state == '') & (df.city != ''))].index, inplace=True)

Investigate empty city and zipcode but valid US state

Percentage of total amount for data with valid US state, but no city, zipcode: 4.509

Most of this amount (1.7 of 1.8 million) comes from about 600 donors in California. We already know that California is a major contributor to donations.

Although we could do some state-level analytics with this data, the added complexity does not justify the knowledge gained.

Therefore, we drop the state value from these rows and move them over to the no-location dataset we created earlier, to simplify our analysis.


In [ ]:
print 'Percentage of total amount for data with valid US state, but no city, zipcode: {:.3f}'.format(100*sum(df[(df.city == '') & (df.zipcode_initial == '')].amount)/sum(df.amount))

In [ ]:
# Baseline row count before the move; the totals after the move must match this.
print df.shape[0] + noloc_df.shape[0]

In [ ]:
stateonly_df = df[(df.city == '') & (df.zipcode_initial == '')].copy()
stateonly_df.state = ''

## Move the rows with just the state over to the noloc_df dataset
noloc_df = pd.concat([noloc_df, stateonly_df])
df = df[~((df.city == '') & (df.zipcode_initial == ''))].copy()

In [ ]:
# Verify that we transferred all the rows over correctly. This total must match the total from above.
print df.shape[0] + noloc_df.shape[0]

In [ ]:
print 100*sum(df[df.city == ''].amount)/sum(df.amount)

print len(df[df.city == '']), len(df[df.zipcode_initial == ''])
print sum(df[df.city == ''].amount), sum(df[df.zipcode_initial == ''].amount)
print sum(df[(df.city == '') & (df.zipcode_initial != '')].amount),\
    sum(df[(df.city != '') & (df.zipcode_initial == '')].amount)

print sum(df.amount)

Investigating empty city and empty state with non-empty zip code

Since we have zip code data from the US Census, we can use it to fill in the missing city and state.


In [ ]:
## Zip codes from ftp://ftp.census.gov/econ2013/CBP_CSV/zbp13totals.zip
zipcodes = pd.read_csv('in/zbp13totals.txt', dtype={'zip': object})
zipcodes = zipcodes[['zip', 'city', 'stabbr']]
zipcodes = zipcodes.rename(columns = {'zip': 'zipcode', 'stabbr': 'state'})
zipcodes.city = zipcodes.city.str.title()
zipcodes.zipcode = zipcodes.zipcode.astype('str')

In [ ]:
## If we know the zip code, we can populate the city by using the zipcodes data
df.replace({'city': {'': np.nan}, 'state': {'': np.nan}}, inplace=True)

## Set the index correctly for update to work. Then reset it back.
df.set_index(['zipcode'], inplace=True)
zipcodes.set_index(['zipcode'], inplace=True)

df.update(zipcodes, join='left', overwrite=False, raise_conflict=False)
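
For reference, a minimal sketch (with made-up frames) of how DataFrame.update fills NaNs by index alignment when overwrite=False:


In [ ]:
# Toy example (hypothetical values): update only fills missing cells, keeping existing ones
left = pd.DataFrame({'city': [np.nan, 'Boston']}, index=['02139', '02140'])
right = pd.DataFrame({'city': ['Cambridge', 'Somerville']}, index=['02139', '02141'])
left.update(right, overwrite=False)
print left  # 02139 -> Cambridge (filled), 02140 -> Boston (kept), 02141 -> ignored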

In [ ]:
df.reset_index(drop=False, inplace=True)
zipcodes.reset_index(drop=False, inplace=True)

In [ ]:
zipcodesdetail = pd.read_csv('in/zip_code_database.csv')

In [ ]:
zipcodesdetail = zipcodesdetail[zipcodesdetail.country == 'US'][['zip', 'primary_city', 'county', 'state', 'timezone', 'latitude', 'longitude']]
zipcodesdetail = zipcodesdetail.rename(columns = {'zip':'zipcode', 'primary_city': 'city'})

In [ ]:
# The zip codes dataset has quite a few missing values. Filling in what we need for now.
# If this happens again, search for a different data source!!
zipcodesdetail.loc[(zipcodesdetail.city == 'Frisco') & (zipcodesdetail.state == 'TX') & (pd.isnull(zipcodesdetail.county)), 'county'] = 'Denton'

In [ ]:
# Strip the ' County' portion from the county names
def getcounty(county):
    if pd.isnull(county):
        return county
    elif county.endswith(' County'):
        return county[:-7]
    else:
        return county

zipcodesdetail.county = zipcodesdetail['county'].apply(getcounty)

In [ ]:
zipcodesdetail.zipcode = zipcodesdetail.zipcode.apply(fill_zipcode)

In [ ]:
newcols = np.array(list(set(df.columns).union(zipcodesdetail.columns)))  # union of both frames' columns (set order is arbitrary)

In [ ]:
## Inner join: rows whose (state, city, zipcode) combination is not in the zip code database get dropped here
df = pd.merge(df, zipcodesdetail, on=['state', 'city', 'zipcode'], how='inner', suffixes=('_x', ''))[newcols]

In [ ]:
# For some reason, the data types are being reset. So setting them back to their expected data types.
df.donation_date = df.donation_date.apply(pd.to_datetime)
df.charitable = df.charitable.apply(bool)
df.amount = df.amount.apply(float)  # amounts include cents, so keep them as floats

Investigate invalid zip codes


In [ ]:
all_zipcodes = pd.merge(df, zipcodes, on='zipcode', how='left')
all_zipcodes[pd.isnull(all_zipcodes.city_x)].head()

In [ ]:
## Only a couple of rows have invalid zip codes. Let's drop them.
df.drop(df[df.zipcode_initial.isin(['GU214ND','94000'])].index, axis=0, inplace=True)

Final check on all location data to confirm that we have no rows with an empty state, city, or zipcode


In [ ]:
print 'No state: count of rows: ', len(df[df.state == ''].amount),\
      'Total amount: ', sum(df[df.state == ''].amount)
print 'No zipcode: count of rows: ', len(df[df.zipcode == ''].amount),\
      'Total amount: ', sum(df[df.zipcode == ''].amount)
print 'No city: count of rows: ', len(df[df.city == ''].amount),\
      'Total amount: ', sum(df[df.city == ''].amount)

In [ ]:
# Examining data - top 10 states by amount and number of donors
print df.groupby('state')[['amount']].sum().sort_values(by='amount', ascending=False)[0:10]
print df.groupby('state')[['donor_id']].count().sort_values(by='donor_id', ascending=False)[0:10]

In [ ]:
print noloc_df.state.unique()
print noloc_df.city.unique()
print noloc_df.zipcode.unique()

In [ ]:
noloc_df['city'] = ''
noloc_df['state'] = ''
noloc_df['zipcode'] = ''

In [ ]:
print df.shape[0] + noloc_df.shape[0]

In [ ]:
df.shape, noloc_df.shape

In [ ]:
# The input data has only the latest zip code for each donor, so we cannot observe any movement even
# if there was any: all donations by a given donor carry the same exact zipcode.
x1 = pd.DataFrame(df.groupby('donor_id').zipcode.nunique())
x1[x1.zipcode != 1]

In [ ]:
# The noloc_df and the df with location values have no donors in common - so we cannot use the donor
# location information from df to detect the location in noloc_df.
set(df.donor_id.values).intersection(noloc_df.donor_id.values)

In [ ]:
df.rename(columns={'donation_date': 'activity_date'}, inplace=True)
df['activity_year'] = df.activity_date.apply(lambda x: x.year)
df['activity_month'] = df.activity_date.apply(lambda x: x.month)
df['activity_dow'] = df.activity_date.apply(lambda x: x.dayofweek)
df['activity_ym'] = df['activity_date'].map(lambda x: 100*x.year + x.month)
df['activity_yq'] = df['activity_date'].map(lambda x: 10*x.year + (x.month-1)//3)
df['activity_ymd'] = df['activity_date'].map(lambda x: 10000*x.year + 100*x.month + x.day)
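
A quick worked example of the three numeric date keys on a sample date (hypothetical value):


In [ ]:
# Worked example of the date-key encodings (a sketch)
d = pd.Timestamp('2012-05-13')
print 100*d.year + d.month                  # 201205   (year-month)
print 10*d.year + (d.month - 1)//3          # 20121    (year-quarter, quarters numbered 0-3)
print 10000*d.year + 100*d.month + d.day    # 20120513 (year-month-day)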

In [ ]:
# Drop the zipcode_initial (for privacy reasons)
df.drop('zipcode_initial', axis=1, inplace=True)

All done! Let's save our dataframes for the next stage of processing.


In [ ]:
!mkdir -p out/0
df.to_pickle('out/0/donations.pkl')
noloc_df.to_pickle('out/0/donations_noloc.pkl')

In [ ]:
df[df.donor_id == '_1D50SWTKX'].sort_values(by='activity_date').tail()

In [ ]:
df.columns
