Extract donor summaries

This step is to extract summary datasets about the donors:

  1. yearly_actual_donors - has the actual donations from each county by year
  2. yearly_expected_donors - This is projecting the mean donations across the entire country on the indian population in that county/year to come up with yearly projections for the donations from those counties in those years.

In [1]:
import pandas as pd
import numpy as np

In [2]:
donations = pd.read_pickle('out/21/donations.pkl')
us_states = pd.read_pickle('out/11/states_gps.pkl')
us_counties = pd.read_pickle('out/11/counties_gps.pkl')
population = pd.read_pickle('out/11/indian_population.pkl')

In [3]:
yearly_india_pop = population.set_index(['county_id', 'county_norm', 'state']).apply(sum)
yearly_donor_pop = donations.groupby('activity_year').donor_id.nunique()
yearly_mean_donor_fraction = yearly_donor_pop*1.0/yearly_india_pop

yearly_expected_donors = population.set_index(['county_id', 'county_norm', 'state']).mul(yearly_mean_donor_fraction, axis=1).astype('int')
yearly_expected_donors = yearly_expected_donors.reset_index()

In [4]:
yearly_actual_donors = donations\
    .groupby(['state', 'county_norm', 'county_id', 'activity_year'])\
    .donor_id\
    .nunique()\
    .to_frame()\
    .unstack()\
    .fillna(0)

# drop the "donor_id" dummy level
yearly_actual_donors.columns = yearly_actual_donors.columns.droplevel()

In [5]:
# rows don't match
yearly_expected_donors.shape, yearly_actual_donors.shape, population.shape


Out[5]:
((3102, 19), (753, 16), (3102, 19))

In [6]:
# To make things easier later, we want to set the values for all the counties in the yearly actual donor dataframe
yearly_actual_donors= yearly_actual_donors.reset_index()\
    .merge(population.reset_index()[['state', 'county_norm', 'county_id']],
           on=['state', 'county_norm','county_id'],
           how='right')\
    .sort_values(by='county_norm')\
    .fillna(0)

In [7]:
# rows match after conversion
yearly_expected_donors.shape, yearly_actual_donors.shape, population.shape


Out[7]:
((3102, 19), (3102, 19), (3102, 19))

In [8]:
us_counties.head()


Out[8]:
county state lats lons county_norm census_region_name state_name county_id
2016 Autauga AL [32.4757, 32.46599, 32.45054, 32.44245, 32.439... [-86.41182, -86.41177, -86.41167, -86.41157, -... autauga South Alabama 0
1103 Baldwin AL [30.28557, 30.21934, 30.21771, 30.21183, 30.20... [-87.51203, -87.56704, -87.5741, -87.59954, -8... baldwin South Alabama 1
681 Barbour AL [32.02221, 32.02066, 32.0135, 32.00249, 31.996... [-85.04884, -85.05367, -85.05381, -85.06454, -... barbour South Alabama 2
2965 Bibb AL [33.13143, 33.13086, 33.15133, 33.18184, 33.18... [-87.23637, -87.21582, -87.19914, -87.19907, -... bibb South Alabama 3
1521 Blount AL [33.949, 33.95621, 33.9629, 33.97324, 33.99538... [-86.8332, -86.81779, -86.79248, -86.7719, -86... blount South Alabama 4

In [9]:
cols = range(2001, 2017)
cols.extend(['state', 'county_norm', 'county_id'])

In [10]:
yearly_expected_donors = yearly_expected_donors\
    .sort_values(by=['county_id'])\
    .reset_index(drop=True)[cols]

In [11]:
yearly_actual_donors = yearly_actual_donors\
    .sort_values(by=['county_id'])\
    .reset_index(drop=True)[cols]

In [12]:
yearly_actual_donors.head()


Out[12]:
activity_year 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 state county_norm county_id
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL autauga 0
1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 AL baldwin 1
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL barbour 2
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL bibb 3
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 AL blount 4

In [13]:
!mkdir -p out/40
yearly_expected_donors.to_pickle('out/40/yearly_expected_donors.pkl')
yearly_actual_donors.to_pickle('out/40/yearly_actual_donors.pkl')

Testing out queries for the web app


In [14]:
indian_population = population

In [15]:
yearly_actual_donors.head()


Out[15]:
activity_year 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 state county_norm county_id
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL autauga 0
1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 AL baldwin 1
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL barbour 2
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL bibb 3
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 AL blount 4

In [16]:
statesfilter = ['CA', 'WA']
yearly_actual_donors[yearly_actual_donors.state.isin(statesfilter)]
yearly_expected_donors[yearly_expected_donors.state.isin(statesfilter)]
_ = indian_population[indian_population.state.isin(statesfilter)]

In [17]:
yearly_expected_donors.head()


Out[17]:
2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 state county_norm county_id
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL autauga 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL baldwin 1
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL barbour 2
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL bibb 3
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 AL blount 4

In [18]:
actual =  yearly_actual_donors[yearly_actual_donors.state.isin(statesfilter)]
expected =  yearly_expected_donors[yearly_expected_donors.state.isin(statesfilter)]
population =  indian_population[indian_population.state.isin(statesfilter)]

In [19]:
difference = actual.set_index(['state', 'county_norm']) - expected.set_index(['state', 'county_norm'])

In [20]:
population[population.county_norm=='snohomish']


Out[20]:
state county_norm 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 county_id
2943 WA snohomish 1329 1700 2071 2443 2814 3185 3556 3928 4299 4670 5041 5413 5784 6155 6526 6898 2943

In [21]:
population =  indian_population[indian_population.state.isin(statesfilter)]

In [22]:
year = 2014
statesfilter = ['AZ', 'CA', 'CO', 'ID', 'MT', 'NM', 'NV', 'OR', 'UT', 'WA', 'WY']
print statesfilter
population[population.state.isin(statesfilter)][year]
population.query('state in @statesfilter')[['county_norm', year]]


['AZ', 'CA', 'CO', 'ID', 'MT', 'NM', 'NV', 'OR', 'UT', 'WA', 'WY']
Out[22]:
county_norm 2014
157 alameda 65143
158 alpine 0
159 amador 16
160 butte 545
161 calaveras 18
162 colusa 0
163 contracosta 19034
164 delnorte 0
165 eldorado 1251
166 fresno 12994
167 glenn 9
168 humboldt 171
169 imperial 242
170 inyo 7
171 kern 7316
172 kings 475
173 lake 158
174 lassen 4
175 losangeles 54598
176 madera 1392
177 marin 1576
178 mariposa 6
179 mendocino 368
180 merced 4083
181 modoc 0
182 mono 0
183 monterey 1347
184 napa 589
185 nevada 156
186 orange 29172
... ... ...
2922 ferry 4
2923 franklin 129
2924 garfield 1
2925 grant 173
2926 graysharbor 34
2927 island 5
2928 jefferson 6
2929 king 45633
2930 kitsap 241
2931 kittitas 0
2932 klickitat 0
2933 lewis 0
2934 lincoln 0
2935 mason 0
2936 okanogan 0
2937 pacific 12
2938 pendoreille 0
2939 pierce 1902
2940 sanjuan 16
2941 skagit 44
2942 skamania 9
2943 snohomish 6155
2944 spokane 380
2945 stevens 0
2946 thurston 1139
2947 wahkiakum 0
2948 wallawalla 36
2949 whatcom 844
2950 whitman 416
2951 yakima 171

97 rows × 2 columns


In [23]:
population.county_norm.nunique()


Out[23]:
97

In [24]:
indian_population[(indian_population.state=='WA') & (indian_population.county_norm=='king')]


Out[24]:
state county_norm 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 county_id
2929 WA king 6376 9396 12416 15436 18455 21475 24495 27515 30534 33554 36574 39594 42613 45633 48653 51673 2929

In [25]:
us_counties[(us_counties.state=='WA') & (us_counties.county_norm=='king')]


Out[25]:
county state lats lons county_norm census_region_name state_name county_id
863 King WA [47.25792, 47.25792, 47.30373, 47.40299, 47.40... [-122.33363, -122.33491, -122.39325, -122.5367... king West Washington 2929

In [26]:
donations[(donations.state=='WA') & (donations.county_norm=='king')].county_id.unique()


Out[26]:
array([2929], dtype=int64)

In [ ]: