In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.models import HoverTool
from colour import Color
In [9]:
donations = pd.read_pickle('out/21/donations.pkl')
us_states = pd.read_pickle('out/11/states_gps.pkl')
us_counties = pd.read_pickle('out/11/counties_gps.pkl')
population = pd.read_pickle('out/11/indian_population.pkl')
mainland_states = us_states[~us_states.state.isin(['AK', 'HI', 'PR', 'GU', 'VI', 'MP', 'AS'])]
mainland_counties = us_counties[~us_counties.state.isin(['AK', 'HI', 'PR', 'GU', 'VI', 'MP', 'AS'])]
In [5]:
def thousands_format(num):
if abs(num) >= 1000000:
num /= 1000000.0
return "{:5.2f}M".format(num)
elif abs(num) >= 1000:
num /= 1000.0
return "{:5.2f}K".format(num)
else:
return str(num)
pd.Series([12345, 123, 1234567890]).apply(thousands_format)
Out[5]:
In [16]:
# Exclude fund-raiser contributions for the remainder of this exercise
donations = donations[donations.is_service==False]
In [33]:
# Get YTD data for this year (and add comparison with last year)
curr_month = 13
curr_year = 2015
prev_year = curr_year-1
donor_last = donations[(donations.activity_year == prev_year) & (donations.activity_month <= curr_month)]\
.groupby(['state', 'county_norm', 'activity_ym']).donor_id\
.nunique()\
.unstack()\
.fillna(0)\
.sum(axis=1)\
.to_frame()\
.rename(columns={0:'prev_donor'})
donor_curr = donations[(donations.activity_year == curr_year) & (donations.activity_month <= curr_month)]\
.groupby(['state', 'county_norm', 'activity_ym']).donor_id\
.nunique()\
.unstack()\
.fillna(0)\
.sum(axis=1)\
.to_frame()\
.rename(columns={0:'curr_donor'})
amount_prev = donations[(donations.activity_year == prev_year) & (donations.activity_month <= curr_month)]\
.groupby(['state', 'county_norm', 'activity_ym']).amount\
.sum()\
.unstack()\
.fillna(0)\
.sum(axis=1)\
.to_frame()\
.rename(columns={0:'prev_amount'})
amount_curr = donations[(donations.activity_year == curr_year) & (donations.activity_month <= curr_month)]\
.groupby(['state', 'county_norm', 'activity_ym']).amount\
.sum()\
.unstack()\
.fillna(0)\
.sum(axis=1)\
.to_frame()\
.rename(columns={0:'curr_amount'})
data = pd.merge(donor_last, donor_curr, how='outer', left_index=True, right_index=True).fillna(0)
data = pd.merge(data, amount_prev, left_index=True, right_index=True, how='outer').fillna(0).astype('int')
data = pd.merge(data, amount_curr, left_index=True, right_index=True, how='outer').fillna(0).astype('int')
pop = population.set_index(['state', 'county_norm'])[[prev_year, curr_year]]
pop_ratio = (pop[curr_year]/pop[prev_year]).fillna(0)
data['difference'] = data['curr_donor'] - data['prev_donor'].rmul(pop_ratio, axis='rows').fillna(0)
data = data.reindex(population.set_index(['state', 'county_norm']).index, fill_value=0)
print data.loc['WA','king']
print data.loc['CA','fresno']
print data.loc['AL','autauga']
In [34]:
def plot_year_comparison_choropleth(data, ncat, title, curr_year):
prev_year = curr_year - 1
states = mainland_states.copy()
counties = mainland_counties.copy()
state_xs = states.lons.tolist()
state_ys = states.lats.tolist()
county_xs = counties.lons.tolist()
county_ys = counties.lats.tolist()
# Get color for all counties
zero_min = -5
zero_max = 5
pos_palette = [Color(rgb=x).hex for x in sns.color_palette("Blues", ncat)]
neg_palette = [Color(rgb=x).hex for x in sns.color_palette("Reds_r", ncat)]
pos_diffs = data[data.difference > zero_max]
pos_labels = pd.qcut(pos_diffs.difference, ncat, labels=range(ncat))
pos_colors = pos_labels.map(pd.Series(pos_palette))
neg_diffs = data[data.difference < zero_min]
neg_labels = pd.qcut(neg_diffs.difference, ncat, labels=range(ncat))
neg_colors = neg_labels.map(pd.Series(neg_palette))
zero_diffs = data[(data.difference >= zero_min) & (data.difference <= zero_max)]
zero_colors = pd.Series(data='#ffffff', index=zero_diffs.index)
all_colors = pd.concat([neg_colors, zero_colors, pos_colors]).sort_index()
source = ColumnDataSource(
data = dict(
xs=county_xs,
ys=county_ys,
counties=counties.county.values.tolist(),
colors=all_colors.values.flatten().tolist(),
prev_donor=data.prev_donor.tolist(),
curr_donor=data.curr_donor.tolist(),
curr_pop=population[curr_year].apply(thousands_format).values.tolist(),
prev_pop=population[prev_year].apply(thousands_format).values.tolist(),
curr_amount=data.curr_amount.apply(thousands_format).values.tolist(),
prev_amount=data.prev_amount.apply(thousands_format).values.tolist(),
)
)
output_file("donations_year_comparison.html", title=title)
p = figure(title=title, toolbar_location="left",plot_width=1100, plot_height=700,
tools='hover, pan, wheel_zoom, box_zoom, reset, help')
hover = p.select(dict(type=HoverTool))
hover.point_policy = "follow_mouse"
hover.tooltips="""
<div>
<div>
<span style="font-size: 15px;"><b>@counties</b></span>
</div>
<table style="width:100%">
<tr>
<td></td>
<td><b>{0}</b></td>
<td><b>{1}</b></td>
</tr>
<tr>
<td><b>Indian pop.</b></td>
<td>@prev_pop</td>
<td>@curr_pop</td>
</tr>
<tr>
<td><b>Donors</b></td>
<td>@prev_donor</td>
<td>@curr_donor</td>
</tr>
<tr>
<td><b>Amount</b></td>
<td>@prev_amount</td>
<td>@curr_amount</td>
</tr>
</table>
</div>
""".format(prev_year, curr_year)
# Turn off all axis and grid lines
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_label_text_font_size = '0pt'
p.axis.major_tick_line_color = None
p.axis[0].ticker.num_minor_ticks = 0
p.axis[1].ticker.num_minor_ticks = 0
p.patches(state_xs, state_ys, fill_alpha=0.0, line_color="#696969", line_width=0.75)
# The following must be the last glyph for the hover to function properly
p.patches(xs='xs', ys='ys', fill_color='colors', fill_alpha=0.7,
line_color="#e77575", line_width=0.5, source=source)
return p
In [35]:
p = plot_year_comparison_choropleth(data, 5, 'Donations for 2015 vs 2014', 2015)
show(p)
In [28]:
population[curr_year].sum()
Out[28]:
In [29]:
curr_year = 2015
donor_curr = donations[(donations.activity_year == curr_year)]\
.groupby(['state', 'county_norm', 'activity_ym']).donor_id\
.nunique()\
.unstack()\
.fillna(0)\
.sum(axis=1)\
.to_frame()\
.rename(columns={0:'curr_donor'})
amount_curr = donations[(donations.activity_year == curr_year)]\
.groupby(['state', 'county_norm', 'activity_ym']).amount\
.sum()\
.unstack()\
.fillna(0)\
.sum(axis=1)\
.to_frame()\
.rename(columns={0:'curr_amount'})
data = pd.merge(donor_curr, amount_curr, how='outer', left_index=True, right_index=True).fillna(0)
data = data.reindex(population.set_index(['state', 'county_norm']).index, fill_value=0)
curr_pop = population.set_index(['state', 'county_norm'])[curr_year]
donor_per_capita_ratio = data.curr_donor.sum()/curr_pop.sum()
amount_per_capita_ratio = data.curr_amount.sum()/curr_pop.sum()
data['projected_donor'] = (curr_pop*donor_per_capita_ratio).astype('int')
data['projected_amount'] = (curr_pop*amount_per_capita_ratio).astype('int')
data['difference'] = data.curr_donor - data.projected_donor
print 'Amount donated per Indian: {0}'.format(amount_per_capita_ratio)
print 'Number of donors per Indian: {0}'.format(donor_per_capita_ratio)
print data.loc['WA','king']
print data.loc['CA','fresno']
print data.loc['AL','autauga']
In [30]:
def plot_market_potential_choropleth(data, ncat, title, curr_year):
states = mainland_states.copy()
counties = mainland_counties.copy()
state_xs = states.lons.tolist()
state_ys = states.lats.tolist()
county_xs = counties.lons.tolist()
county_ys = counties.lats.tolist()
# Get color for all counties
zero_min = -5
zero_max = 5
pos_palette = [Color(rgb=x).hex for x in sns.color_palette("Blues", ncat)]
neg_palette = [Color(rgb=x).hex for x in sns.color_palette("Reds_r", ncat)]
pos_diffs = data[data.difference > zero_max]
pos_labels = pd.qcut(pos_diffs.difference, ncat, labels=range(ncat))
pos_colors = pos_labels.map(pd.Series(pos_palette))
neg_diffs = data[data.difference < zero_min]
neg_labels = pd.qcut(neg_diffs.difference, ncat, labels=range(ncat))
neg_colors = neg_labels.map(pd.Series(neg_palette))
zero_diffs = data[(data.difference >= zero_min) & (data.difference <= zero_max)]
zero_colors = pd.Series(data='#ffffff', index=zero_diffs.index)
all_colors = pd.concat([neg_colors, zero_colors, pos_colors]).sort_index()
source = ColumnDataSource(
data = dict(
xs=county_xs,
ys=county_ys,
counties=counties.county.values.tolist(),
colors=all_colors.values.flatten().tolist(),
actual_donor=data.curr_donor.tolist(),
actual_amount=data.curr_amount.apply(thousands_format).values.tolist(),
projected_donor=data.projected_donor.values.tolist(),
projected_amount=data.projected_amount.apply(thousands_format).values.tolist(),
actual_pop=population[curr_year].apply(thousands_format).values.tolist(),
)
)
output_file("market_potential.html", title=title)
p = figure(title=title, toolbar_location="left",plot_width=1100, plot_height=700,
tools='hover, pan, wheel_zoom, box_zoom, reset, help')
hover = p.select(dict(type=HoverTool))
hover.point_policy = "follow_mouse"
hover.tooltips="""
<div>
<div>
<span style="font-size: 15px;"><b>@counties</b> (Indian pop.: @actual_pop)</span>
</div>
<table style="width:100%">
<tr>
<td></td>
<td><b>Unique Donors</b></td>
<td><b>Amount</b></td>
</tr>
<tr>
<td><b>Projected</b></td>
<td>@projected_donor</td>
<td>@projected_amount</td>
</tr>
<tr>
<td><b>Actual</b></td>
<td>@actual_donor</td>
<td>@actual_amount</td>
</tr>
</table>
</div>
""".format(prev_year, curr_year)
# Turn off all axis and grid lines
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_label_text_font_size = '0pt'
p.axis.major_tick_line_color = None
p.axis[0].ticker.num_minor_ticks = 0
p.axis[1].ticker.num_minor_ticks = 0
p.patches(state_xs, state_ys, fill_alpha=0.0, line_color="#696969", line_width=0.75)
# The following must be the last glyph for the hover to function properly
p.patches(xs='xs', ys='ys', fill_color='colors', fill_alpha=0.7,
line_color="#e77575", line_width=0.5, source=source)
return p
In [31]:
p = plot_market_potential_choropleth(data, 5, 'Market potential based on donations (2015)', 2015)
show(p)
In [ ]: