Extract donor summaries

This step is to extract summary datasets about the donors:

yearly_actual_donors - has the actual donations from each county by year
yearly_expected_donors - This is projecting the mean donations across the entire country on the indian population in that county/year to come up with yearly projections for the donations from those counties in those years.



In [1]:

    
import pandas as pd
import numpy as np



In [2]:

    
donations = pd.read_pickle('out/21/donations.pkl')
us_states = pd.read_pickle('out/11/states_gps.pkl')
us_counties = pd.read_pickle('out/11/counties_gps.pkl')
population = pd.read_pickle('out/11/indian_population.pkl')



In [3]:

    
yearly_india_pop = population.set_index(['county_id', 'county_norm', 'state']).apply(sum)
yearly_donor_pop = donations.groupby('activity_year').donor_id.nunique()
yearly_mean_donor_fraction = yearly_donor_pop*1.0/yearly_india_pop

yearly_expected_donors = population.set_index(['county_id', 'county_norm', 'state']).mul(yearly_mean_donor_fraction, axis=1).astype('int')
yearly_expected_donors = yearly_expected_donors.reset_index()



In [4]:

    
yearly_actual_donors = donations\
    .groupby(['state', 'county_norm', 'county_id', 'activity_year'])\
    .donor_id\
    .nunique()\
    .to_frame()\
    .unstack()\
    .fillna(0)

# drop the "donor_id" dummy level
yearly_actual_donors.columns = yearly_actual_donors.columns.droplevel()



In [5]:

    
# rows don't match
yearly_expected_donors.shape, yearly_actual_donors.shape, population.shape









    Out[5]:





((3102, 19), (753, 16), (3102, 19))



In [6]:

    
# To make things easier later, we want to set the values for all the counties in the yearly actual donor dataframe
yearly_actual_donors= yearly_actual_donors.reset_index()\
    .merge(population.reset_index()[['state', 'county_norm', 'county_id']],
           on=['state', 'county_norm','county_id'],
           how='right')\
    .sort_values(by='county_norm')\
    .fillna(0)



In [7]:

    
# rows match after conversion
yearly_expected_donors.shape, yearly_actual_donors.shape, population.shape









    Out[7]:





((3102, 19), (3102, 19), (3102, 19))



In [8]:

    
us_counties.head()









    Out[8]:






  
    
      
      county
      state
      lats
      lons
      county_norm
      census_region_name
      state_name
      county_id
    
  
  
    
      2016
      Autauga
      AL
      [32.4757, 32.46599, 32.45054, 32.44245, 32.439...
      [-86.41182, -86.41177, -86.41167, -86.41157, -...
      autauga
      South
      Alabama
      0
    
    
      1103
      Baldwin
      AL
      [30.28557, 30.21934, 30.21771, 30.21183, 30.20...
      [-87.51203, -87.56704, -87.5741, -87.59954, -8...
      baldwin
      South
      Alabama
      1
    
    
      681
      Barbour
      AL
      [32.02221, 32.02066, 32.0135, 32.00249, 31.996...
      [-85.04884, -85.05367, -85.05381, -85.06454, -...
      barbour
      South
      Alabama
      2
    
    
      2965
      Bibb
      AL
      [33.13143, 33.13086, 33.15133, 33.18184, 33.18...
      [-87.23637, -87.21582, -87.19914, -87.19907, -...
      bibb
      South
      Alabama
      3
    
    
      1521
      Blount
      AL
      [33.949, 33.95621, 33.9629, 33.97324, 33.99538...
      [-86.8332, -86.81779, -86.79248, -86.7719, -86...
      blount
      South
      Alabama
      4



In [9]:

    
cols = range(2001, 2017)
cols.extend(['state', 'county_norm', 'county_id'])



In [10]:

    
yearly_expected_donors = yearly_expected_donors\
    .sort_values(by=['county_id'])\
    .reset_index(drop=True)[cols]



In [11]:

    
yearly_actual_donors = yearly_actual_donors\
    .sort_values(by=['county_id'])\
    .reset_index(drop=True)[cols]



In [12]:

    
yearly_actual_donors.head()









    Out[12]:






  
    
      activity_year
      2001
      2002
      2003
      2004
      2005
      2006
      2007
      2008
      2009
      2010
      2011
      2012
      2013
      2014
      2015
      2016
      state
      county_norm
      county_id
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      autauga
      0
    
    
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      AL
      baldwin
      1
    
    
      2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      barbour
      2
    
    
      3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      bibb
      3
    
    
      4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      AL
      blount
      4



In [13]:

    
!mkdir -p out/40
yearly_expected_donors.to_pickle('out/40/yearly_expected_donors.pkl')
yearly_actual_donors.to_pickle('out/40/yearly_actual_donors.pkl')

Testing out queries for the web app



In [14]:

    
indian_population = population



In [15]:

    
yearly_actual_donors.head()









    Out[15]:






  
    
      activity_year
      2001
      2002
      2003
      2004
      2005
      2006
      2007
      2008
      2009
      2010
      2011
      2012
      2013
      2014
      2015
      2016
      state
      county_norm
      county_id
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      autauga
      0
    
    
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      AL
      baldwin
      1
    
    
      2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      barbour
      2
    
    
      3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      bibb
      3
    
    
      4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      AL
      blount
      4



In [16]:

    
statesfilter = ['CA', 'WA']
yearly_actual_donors[yearly_actual_donors.state.isin(statesfilter)]
yearly_expected_donors[yearly_expected_donors.state.isin(statesfilter)]
_ = indian_population[indian_population.state.isin(statesfilter)]



In [17]:

    
yearly_expected_donors.head()









    Out[17]:






  
    
      
      2001
      2002
      2003
      2004
      2005
      2006
      2007
      2008
      2009
      2010
      2011
      2012
      2013
      2014
      2015
      2016
      state
      county_norm
      county_id
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      autauga
      0
    
    
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      baldwin
      1
    
    
      2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      barbour
      2
    
    
      3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      bibb
      3
    
    
      4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      AL
      blount
      4



In [18]:

    
actual =  yearly_actual_donors[yearly_actual_donors.state.isin(statesfilter)]
expected =  yearly_expected_donors[yearly_expected_donors.state.isin(statesfilter)]
population =  indian_population[indian_population.state.isin(statesfilter)]



In [19]:

    
difference = actual.set_index(['state', 'county_norm']) - expected.set_index(['state', 'county_norm'])



In [20]:

    
population[population.county_norm=='snohomish']









    Out[20]:






  
    
      
      state
      county_norm
      2001
      2002
      2003
      2004
      2005
      2006
      2007
      2008
      2009
      2010
      2011
      2012
      2013
      2014
      2015
      2016
      county_id
    
  
  
    
      2943
      WA
      snohomish
      1329
      1700
      2071
      2443
      2814
      3185
      3556
      3928
      4299
      4670
      5041
      5413
      5784
      6155
      6526
      6898
      2943



In [21]:

    
population =  indian_population[indian_population.state.isin(statesfilter)]



In [22]:

    
year = 2014
statesfilter = ['AZ', 'CA', 'CO', 'ID', 'MT', 'NM', 'NV', 'OR', 'UT', 'WA', 'WY']
print statesfilter
population[population.state.isin(statesfilter)][year]
population.query('state in @statesfilter')[['county_norm', year]]









    



['AZ', 'CA', 'CO', 'ID', 'MT', 'NM', 'NV', 'OR', 'UT', 'WA', 'WY']






    Out[22]:






  
    
      
      county_norm
      2014
    
  
  
    
      157
      alameda
      65143
    
    
      158
      alpine
      0
    
    
      159
      amador
      16
    
    
      160
      butte
      545
    
    
      161
      calaveras
      18
    
    
      162
      colusa
      0
    
    
      163
      contracosta
      19034
    
    
      164
      delnorte
      0
    
    
      165
      eldorado
      1251
    
    
      166
      fresno
      12994
    
    
      167
      glenn
      9
    
    
      168
      humboldt
      171
    
    
      169
      imperial
      242
    
    
      170
      inyo
      7
    
    
      171
      kern
      7316
    
    
      172
      kings
      475
    
    
      173
      lake
      158
    
    
      174
      lassen
      4
    
    
      175
      losangeles
      54598
    
    
      176
      madera
      1392
    
    
      177
      marin
      1576
    
    
      178
      mariposa
      6
    
    
      179
      mendocino
      368
    
    
      180
      merced
      4083
    
    
      181
      modoc
      0
    
    
      182
      mono
      0
    
    
      183
      monterey
      1347
    
    
      184
      napa
      589
    
    
      185
      nevada
      156
    
    
      186
      orange
      29172
    
    
      ...
      ...
      ...
    
    
      2922
      ferry
      4
    
    
      2923
      franklin
      129
    
    
      2924
      garfield
      1
    
    
      2925
      grant
      173
    
    
      2926
      graysharbor
      34
    
    
      2927
      island
      5
    
    
      2928
      jefferson
      6
    
    
      2929
      king
      45633
    
    
      2930
      kitsap
      241
    
    
      2931
      kittitas
      0
    
    
      2932
      klickitat
      0
    
    
      2933
      lewis
      0
    
    
      2934
      lincoln
      0
    
    
      2935
      mason
      0
    
    
      2936
      okanogan
      0
    
    
      2937
      pacific
      12
    
    
      2938
      pendoreille
      0
    
    
      2939
      pierce
      1902
    
    
      2940
      sanjuan
      16
    
    
      2941
      skagit
      44
    
    
      2942
      skamania
      9
    
    
      2943
      snohomish
      6155
    
    
      2944
      spokane
      380
    
    
      2945
      stevens
      0
    
    
      2946
      thurston
      1139
    
    
      2947
      wahkiakum
      0
    
    
      2948
      wallawalla
      36
    
    
      2949
      whatcom
      844
    
    
      2950
      whitman
      416
    
    
      2951
      yakima
      171
    
  

97 rows × 2 columns



In [23]:

    
population.county_norm.nunique()









    Out[23]:





97



In [24]:

    
indian_population[(indian_population.state=='WA') & (indian_population.county_norm=='king')]









    Out[24]:






  
    
      
      state
      county_norm
      2001
      2002
      2003
      2004
      2005
      2006
      2007
      2008
      2009
      2010
      2011
      2012
      2013
      2014
      2015
      2016
      county_id
    
  
  
    
      2929
      WA
      king
      6376
      9396
      12416
      15436
      18455
      21475
      24495
      27515
      30534
      33554
      36574
      39594
      42613
      45633
      48653
      51673
      2929



In [25]:

    
us_counties[(us_counties.state=='WA') & (us_counties.county_norm=='king')]









    Out[25]:






  
    
      
      county
      state
      lats
      lons
      county_norm
      census_region_name
      state_name
      county_id
    
  
  
    
      863
      King
      WA
      [47.25792, 47.25792, 47.30373, 47.40299, 47.40...
      [-122.33363, -122.33491, -122.39325, -122.5367...
      king
      West
      Washington
      2929



In [26]:

    
donations[(donations.state=='WA') & (donations.county_norm=='king')].county_id.unique()









    Out[26]:





array([2929], dtype=int64)



In [ ]:

	county	state	lats	lons	county_norm	census_region_name	state_name	county_id
2016	Autauga	AL	[32.4757, 32.46599, 32.45054, 32.44245, 32.439...	[-86.41182, -86.41177, -86.41167, -86.41157, -...	autauga	South	Alabama	0
1103	Baldwin	AL	[30.28557, 30.21934, 30.21771, 30.21183, 30.20...	[-87.51203, -87.56704, -87.5741, -87.59954, -8...	baldwin	South	Alabama	1
681	Barbour	AL	[32.02221, 32.02066, 32.0135, 32.00249, 31.996...	[-85.04884, -85.05367, -85.05381, -85.06454, -...	barbour	South	Alabama	2
2965	Bibb	AL	[33.13143, 33.13086, 33.15133, 33.18184, 33.18...	[-87.23637, -87.21582, -87.19914, -87.19907, -...	bibb	South	Alabama	3
1521	Blount	AL	[33.949, 33.95621, 33.9629, 33.97324, 33.99538...	[-86.8332, -86.81779, -86.79248, -86.7719, -86...	blount	South	Alabama	4

	county_norm	2014
157	alameda	65143
158	alpine	0
159	amador	16
160	butte	545
161	calaveras	18
162	colusa	0
163	contracosta	19034
164	delnorte	0
165	eldorado	1251
166	fresno	12994
167	glenn	9
168	humboldt	171
169	imperial	242
170	inyo	7
171	kern	7316
172	kings	475
173	lake	158
174	lassen	4
175	losangeles	54598
176	madera	1392
177	marin	1576
178	mariposa	6
179	mendocino	368
180	merced	4083
181	modoc	0
182	mono	0
183	monterey	1347
184	napa	589
185	nevada	156
186	orange	29172
...	...	...
2922	ferry	4
2923	franklin	129
2924	garfield	1
2925	grant	173
2926	graysharbor	34
2927	island	5
2928	jefferson	6
2929	king	45633
2930	kitsap	241
2931	kittitas	0
2932	klickitat	0
2933	lewis	0
2934	lincoln	0
2935	mason	0
2936	okanogan	0
2937	pacific	12
2938	pendoreille	0
2939	pierce	1902
2940	sanjuan	16
2941	skagit	44
2942	skamania	9
2943	snohomish	6155
2944	spokane	380
2945	stevens	0
2946	thurston	1139
2947	wahkiakum	0
2948	wallawalla	36
2949	whatcom	844
2950	whitman	416
2951	yakima	171