This step extracts per-county, per-year summary datasets about the donors (expected vs. actual donor counts):
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Load the pickled inputs used by this step. The paths suggest they are
# outputs of earlier pipeline steps (11 and 21) — TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
donations = pd.read_pickle('out/21/donations.pkl')
us_states = pd.read_pickle('out/11/states_gps.pkl')
us_counties = pd.read_pickle('out/11/counties_gps.pkl')
population = pd.read_pickle('out/11/indian_population.pkl')
In [3]:
# Expected donors per county and year: scale every county's population by the
# overall donors-per-population ratio of the matching year.
index_cols = ['county_id', 'county_norm', 'state']
population_by_county = population.set_index(index_cols)

# Column-wise totals of the population table (presumably one column per year).
yearly_india_pop = population_by_county.apply(sum)
# Number of distinct donors seen in each activity year.
yearly_donor_pop = donations.groupby('activity_year')['donor_id'].nunique()
# `* 1.0` forces floating-point division.
yearly_mean_donor_fraction = yearly_donor_pop * 1.0 / yearly_india_pop

yearly_expected_donors = (
    population_by_county
    .mul(yearly_mean_donor_fraction, axis=1)
    .astype('int')  # drops the fractional part of the expected counts
    .reset_index()
)
In [4]:
# Distinct donors per county and activity year, pivoted so that every
# activity year becomes a column; county/year pairs without donors become 0.
yearly_actual_donors = (
    donations
    .groupby(['state', 'county_norm', 'county_id', 'activity_year'])['donor_id']
    .nunique()
    .to_frame()
    .unstack()
    .fillna(0)
)
# unstack() leaves a two-level column index (donor_id, activity_year);
# keep only the activity-year level.
yearly_actual_donors.columns = yearly_actual_donors.columns.droplevel(0)
In [5]:
# Compare the shapes of the three frames before aligning them; per the
# original note, the row counts do not match at this point.
yearly_expected_donors.shape, yearly_actual_donors.shape, population.shape
Out[5]:
In [6]:
# Right-join the actual-donor table onto the county list taken from
# `population`, so every county listed there gets a row. Counties that do not
# occur in the donations data come back as NaN and are filled with 0.
merge_keys = ['state', 'county_norm', 'county_id']
all_counties = population.reset_index()[merge_keys]
yearly_actual_donors = (
    yearly_actual_donors
    .reset_index()
    .merge(all_counties, on=merge_keys, how='right')
    .sort_values(by='county_norm')
    .fillna(0)
)
In [7]:
# Compare the shapes again after the right-join against `population`; per the
# original note, the row counts now match.
yearly_expected_donors.shape, yearly_actual_donors.shape, population.shape
Out[7]:
In [8]:
# Preview the first rows of the table loaded from 'out/11/counties_gps.pkl'.
us_counties.head()
Out[8]:
In [9]:
# Output column order: the year columns 2001-2016 followed by the county keys.
# Built by list concatenation because a `range` object has no `.extend` on
# Python 3; this form yields the same list on Python 2 and 3.
cols = list(range(2001, 2017)) + ['state', 'county_norm', 'county_id']
In [10]:
# Order the rows by county id, renumber them from 0, and keep only the
# columns named in `cols`, in that order.
expected_by_county_id = yearly_expected_donors.sort_values(by=['county_id'])
yearly_expected_donors = expected_by_county_id.reset_index(drop=True)[cols]
In [11]:
# Same ordering and column selection as for the expected-donor table, so the
# two frames share row order and column layout.
actual_by_county_id = yearly_actual_donors.sort_values(by=['county_id'])
yearly_actual_donors = actual_by_county_id.reset_index(drop=True)[cols]
In [12]:
# Preview the actual-donor table after sorting and column selection.
yearly_actual_donors.head()
Out[12]:
In [13]:
# Persist both summary tables as pickles under out/40.
# `mkdir -p` is run through the shell and does not fail if the directory
# already exists.
!mkdir -p out/40
yearly_expected_donors.to_pickle('out/40/yearly_expected_donors.pkl')
yearly_actual_donors.to_pickle('out/40/yearly_actual_donors.pkl')
In [14]:
# Keep a second name for the unfiltered population table: `population` is
# re-bound to a state-filtered subset in later cells.
indian_population = population
In [15]:
# Preview the actual-donor table (same frame that was just pickled).
yearly_actual_donors.head()
Out[15]:
In [16]:
# States to restrict the actual/expected comparison to. The filtered frames
# themselves are built where they are assigned to names (`actual`,
# `expected`, `population`); nothing is bound to `_` here, so IPython's
# last-output variable keeps working.
statesfilter = ['CA', 'WA']
In [17]:
# Preview the expected-donor table.
yearly_expected_donors.head()
Out[17]:
In [18]:
# Restrict each table to the states listed in `statesfilter`.
# Note: `population` is re-bound to the filtered subset here; the unfiltered
# table remains reachable as `indian_population`.
actual = yearly_actual_donors.loc[yearly_actual_donors['state'].isin(statesfilter)]
expected = yearly_expected_donors.loc[yearly_expected_donors['state'].isin(statesfilter)]
population = indian_population.loc[indian_population['state'].isin(statesfilter)]
In [19]:
# Actual minus expected donors for every county and year.
# `county_id` is included in the index so it acts as an alignment key; left as
# a regular column it would be subtracted along with the year columns.
difference_keys = ['state', 'county_norm', 'county_id']
difference = actual.set_index(difference_keys) - expected.set_index(difference_keys)
In [20]:
# Inspect the population row(s) whose county_norm is 'snohomish'.
population[population.county_norm=='snohomish']
Out[20]:
In [21]:
# Rebuild the state-filtered population table from the unfiltered one, using
# whatever `statesfilter` currently holds.
population = indian_population.loc[indian_population['state'].isin(statesfilter)]
In [22]:
# Population in `year` for every county of the states in `statesfilter`.
year = 2014
statesfilter = ['AZ', 'CA', 'CO', 'ID', 'MT', 'NM', 'NV', 'OR', 'UT', 'WA', 'WY']
print(statesfilter)
# Query the unfiltered table: `population` was already narrowed to CA/WA in an
# earlier cell, so filtering it again would silently drop the other states.
indian_population.query('state in @statesfilter')[['county_norm', year]]
Out[22]:
In [23]:
# Number of distinct county_norm values in the state-filtered `population`.
population.county_norm.nunique()
Out[23]:
In [24]:
# Cross-check: the row(s) for state 'WA', county_norm 'king' in the
# unfiltered population table.
indian_population[(indian_population.state=='WA') & (indian_population.county_norm=='king')]
Out[24]:
In [25]:
# Cross-check: the same county in the county GPS table.
us_counties[(us_counties.state=='WA') & (us_counties.county_norm=='king')]
Out[25]:
In [26]:
# Cross-check: which county_id values the donations data carries for
# state 'WA', county_norm 'king'.
donations[(donations.state=='WA') & (donations.county_norm=='king')].county_id.unique()
Out[26]:
In [ ]: