In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.models import HoverTool
from colour import Color

In [9]:
donations = pd.read_pickle('out/21/donations.pkl')
us_states = pd.read_pickle('out/11/states_gps.pkl')
us_counties = pd.read_pickle('out/11/counties_gps.pkl')
population = pd.read_pickle('out/11/indian_population.pkl')

mainland_states = us_states[~us_states.state.isin(['AK', 'HI', 'PR', 'GU', 'VI', 'MP', 'AS'])]
mainland_counties = us_counties[~us_counties.state.isin(['AK', 'HI', 'PR', 'GU', 'VI', 'MP', 'AS'])]

In [5]:
def thousands_format(num):
    if abs(num) >= 1000000:
        num /= 1000000.0
        return "{:5.2f}M".format(num)
    elif abs(num) >= 1000:
        num /= 1000.0
        return "{:5.2f}K".format(num)
    else:
        return str(num)
    

pd.Series([12345, 123, 1234567890]).apply(thousands_format)


Out[5]:
0      12.35K
1         123
2    1234.57M
dtype: object

Plot 1: Comparing donations for current year with previous year considering donations and unique donors.

  1. Data considered is only for YTD for both years
  2. Color is based on the difference between the actual and expected donations for current year based on previous year while also taking population growth into account.
  3. Blue saturation shows improvement over previous year (higher saturation is better).
  4. Red saturation shows worse performance compared to previous year (higher saturation is worse).

In [16]:
# Exclude fund-raiser contributions for the remainder of this exercise
donations = donations[donations.is_service==False]

In [33]:
# Get YTD data for this year (and add comparison with last year)
curr_month = 13
curr_year = 2015
prev_year = curr_year-1

donor_last = donations[(donations.activity_year == prev_year) & (donations.activity_month <= curr_month)]\
  .groupby(['state', 'county_norm', 'activity_ym']).donor_id\
  .nunique()\
  .unstack()\
  .fillna(0)\
  .sum(axis=1)\
  .to_frame()\
  .rename(columns={0:'prev_donor'})

donor_curr = donations[(donations.activity_year == curr_year) & (donations.activity_month <= curr_month)]\
  .groupby(['state', 'county_norm', 'activity_ym']).donor_id\
  .nunique()\
  .unstack()\
  .fillna(0)\
  .sum(axis=1)\
  .to_frame()\
  .rename(columns={0:'curr_donor'})
    
amount_prev = donations[(donations.activity_year == prev_year) & (donations.activity_month <= curr_month)]\
  .groupby(['state', 'county_norm', 'activity_ym']).amount\
  .sum()\
  .unstack()\
  .fillna(0)\
  .sum(axis=1)\
  .to_frame()\
  .rename(columns={0:'prev_amount'})

amount_curr = donations[(donations.activity_year == curr_year) & (donations.activity_month <= curr_month)]\
  .groupby(['state', 'county_norm', 'activity_ym']).amount\
  .sum()\
  .unstack()\
  .fillna(0)\
  .sum(axis=1)\
  .to_frame()\
  .rename(columns={0:'curr_amount'})

data = pd.merge(donor_last, donor_curr, how='outer', left_index=True, right_index=True).fillna(0)
data = pd.merge(data, amount_prev, left_index=True, right_index=True, how='outer').fillna(0).astype('int')
data = pd.merge(data, amount_curr, left_index=True, right_index=True, how='outer').fillna(0).astype('int')

pop = population.set_index(['state', 'county_norm'])[[prev_year, curr_year]]
pop_ratio = (pop[curr_year]/pop[prev_year]).fillna(0)

data['difference'] = data['curr_donor'] - data['prev_donor'].rmul(pop_ratio, axis='rows').fillna(0)
data = data.reindex(population.set_index(['state', 'county_norm']).index, fill_value=0)

print data.loc['WA','king']
print data.loc['CA','fresno']
print data.loc['AL','autauga']


prev_donor       1021.00000
curr_donor       1012.00000
prev_amount    279026.00000
curr_amount    274974.00000
difference        -76.56996
Name: (WA, king), dtype: float64
prev_donor        35.00000
curr_donor        29.00000
prev_amount    10110.00000
curr_amount    11060.00000
difference        -7.44913
Name: (CA, fresno), dtype: float64
prev_donor     0
curr_donor     0
prev_amount    0
curr_amount    0
difference     0
Name: (AL, autauga), dtype: float64

In [34]:
def plot_year_comparison_choropleth(data, ncat, title, curr_year):

    prev_year = curr_year - 1
    states = mainland_states.copy()
    counties = mainland_counties.copy()

    state_xs = states.lons.tolist()
    state_ys = states.lats.tolist()
    county_xs = counties.lons.tolist()
    county_ys = counties.lats.tolist()
        
    # Get color for all counties
    zero_min = -5
    zero_max = 5
    pos_palette = [Color(rgb=x).hex for x in sns.color_palette("Blues", ncat)]
    neg_palette = [Color(rgb=x).hex for x in sns.color_palette("Reds_r", ncat)]

    pos_diffs = data[data.difference > zero_max]
    pos_labels = pd.qcut(pos_diffs.difference, ncat, labels=range(ncat))
    pos_colors = pos_labels.map(pd.Series(pos_palette))

    neg_diffs = data[data.difference < zero_min]
    neg_labels = pd.qcut(neg_diffs.difference, ncat, labels=range(ncat))
    neg_colors = neg_labels.map(pd.Series(neg_palette))

    zero_diffs = data[(data.difference >= zero_min) & (data.difference <= zero_max)]
    zero_colors = pd.Series(data='#ffffff', index=zero_diffs.index)
    all_colors = pd.concat([neg_colors, zero_colors, pos_colors]).sort_index()

    source = ColumnDataSource(
            data = dict(
                xs=county_xs,
                ys=county_ys,
                counties=counties.county.values.tolist(),
                colors=all_colors.values.flatten().tolist(),
                prev_donor=data.prev_donor.tolist(),
                curr_donor=data.curr_donor.tolist(),
                curr_pop=population[curr_year].apply(thousands_format).values.tolist(),
                prev_pop=population[prev_year].apply(thousands_format).values.tolist(),
                curr_amount=data.curr_amount.apply(thousands_format).values.tolist(),
                prev_amount=data.prev_amount.apply(thousands_format).values.tolist(),
            )
    )
    
    output_file("donations_year_comparison.html", title=title)
    p = figure(title=title, toolbar_location="left",plot_width=1100, plot_height=700,
               tools='hover, pan, wheel_zoom, box_zoom, reset, help')

    hover = p.select(dict(type=HoverTool))
    hover.point_policy = "follow_mouse"
    hover.tooltips="""
            <div>
                <div>
                    <span style="font-size: 15px;"><b>@counties</b></span>
                </div>
                <table style="width:100%">
                  <tr>
                    <td></td>
                    <td><b>{0}</b></td>
                    <td><b>{1}</b></td>
                  </tr>
                  <tr>
                    <td><b>Indian pop.</b></td>
                    <td>@prev_pop</td>
                    <td>@curr_pop</td>
                  </tr>
                  <tr>
                    <td><b>Donors</b></td>
                    <td>@prev_donor</td>
                    <td>@curr_donor</td>
                  </tr>
                  <tr>
                    <td><b>Amount</b></td>
                    <td>@prev_amount</td>
                    <td>@curr_amount</td>
                  </tr>
                </table>
            </div>
            """.format(prev_year, curr_year)

    # Turn off all axis and grid lines
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_label_text_font_size = '0pt'
    p.axis.major_tick_line_color = None
    p.axis[0].ticker.num_minor_ticks = 0
    p.axis[1].ticker.num_minor_ticks = 0

    p.patches(state_xs, state_ys, fill_alpha=0.0, line_color="#696969", line_width=0.75)
    # The following must be the last glyph for the hover to function properly
    p.patches(xs='xs', ys='ys', fill_color='colors', fill_alpha=0.7,
              line_color="#e77575", line_width=0.5, source=source)
    return p

In [35]:
p = plot_year_comparison_choropleth(data, 5, 'Donations for 2015 vs 2014', 2015)
show(p)

Plot 2: Plot showing market growth potential based on Indian population


In [28]:
population[curr_year].sum()


Out[28]:
2211547

In [29]:
curr_year = 2015

donor_curr = donations[(donations.activity_year == curr_year)]\
  .groupby(['state', 'county_norm', 'activity_ym']).donor_id\
  .nunique()\
  .unstack()\
  .fillna(0)\
  .sum(axis=1)\
  .to_frame()\
  .rename(columns={0:'curr_donor'})

amount_curr = donations[(donations.activity_year == curr_year)]\
  .groupby(['state', 'county_norm', 'activity_ym']).amount\
  .sum()\
  .unstack()\
  .fillna(0)\
  .sum(axis=1)\
  .to_frame()\
  .rename(columns={0:'curr_amount'})

data = pd.merge(donor_curr, amount_curr, how='outer', left_index=True, right_index=True).fillna(0)
data = data.reindex(population.set_index(['state', 'county_norm']).index, fill_value=0)

curr_pop = population.set_index(['state', 'county_norm'])[curr_year]
donor_per_capita_ratio = data.curr_donor.sum()/curr_pop.sum()
amount_per_capita_ratio = data.curr_amount.sum()/curr_pop.sum()

data['projected_donor'] = (curr_pop*donor_per_capita_ratio).astype('int')
data['projected_amount'] = (curr_pop*amount_per_capita_ratio).astype('int')

data['difference'] = data.curr_donor - data.projected_donor

print 'Amount donated per Indian: {0}'.format(amount_per_capita_ratio)
print 'Number of donors per Indian: {0}'.format(donor_per_capita_ratio)

print data.loc['WA','king']
print data.loc['CA','fresno']
print data.loc['AL','autauga']


Amount donated per Indian: 2.11216537564
Number of donors per Indian: 0.00753092744581
curr_donor            1012
curr_amount         274974
projected_donor        366
projected_amount    102763
difference             646
Name: (WA, king), dtype: float64
curr_donor             29
curr_amount         11060
projected_donor       101
projected_amount    28581
difference            -72
Name: (CA, fresno), dtype: float64
curr_donor          0
curr_amount         0
projected_donor     0
projected_amount    0
difference          0
Name: (AL, autauga), dtype: float64

In [30]:
def plot_market_potential_choropleth(data, ncat, title, curr_year):

    states = mainland_states.copy()
    counties = mainland_counties.copy()

    state_xs = states.lons.tolist()
    state_ys = states.lats.tolist()
    county_xs = counties.lons.tolist()
    county_ys = counties.lats.tolist()
        
    # Get color for all counties
    zero_min = -5
    zero_max = 5
    pos_palette = [Color(rgb=x).hex for x in sns.color_palette("Blues", ncat)]
    neg_palette = [Color(rgb=x).hex for x in sns.color_palette("Reds_r", ncat)]

    pos_diffs = data[data.difference > zero_max]
    pos_labels = pd.qcut(pos_diffs.difference, ncat, labels=range(ncat))
    pos_colors = pos_labels.map(pd.Series(pos_palette))

    neg_diffs = data[data.difference < zero_min]
    neg_labels = pd.qcut(neg_diffs.difference, ncat, labels=range(ncat))
    neg_colors = neg_labels.map(pd.Series(neg_palette))

    zero_diffs = data[(data.difference >= zero_min) & (data.difference <= zero_max)]
    zero_colors = pd.Series(data='#ffffff', index=zero_diffs.index)
    all_colors = pd.concat([neg_colors, zero_colors, pos_colors]).sort_index()

    source = ColumnDataSource(
            data = dict(
                xs=county_xs,
                ys=county_ys,
                counties=counties.county.values.tolist(),
                colors=all_colors.values.flatten().tolist(),
                actual_donor=data.curr_donor.tolist(),
                actual_amount=data.curr_amount.apply(thousands_format).values.tolist(),
                projected_donor=data.projected_donor.values.tolist(),
                projected_amount=data.projected_amount.apply(thousands_format).values.tolist(),
                actual_pop=population[curr_year].apply(thousands_format).values.tolist(),
            )
    )

    output_file("market_potential.html", title=title)
    p = figure(title=title, toolbar_location="left",plot_width=1100, plot_height=700,
               tools='hover, pan, wheel_zoom, box_zoom, reset, help')

    hover = p.select(dict(type=HoverTool))
    hover.point_policy = "follow_mouse"
    hover.tooltips="""
            <div>
                <div>
                    <span style="font-size: 15px;"><b>@counties</b> (Indian pop.: @actual_pop)</span>
                </div>
                <table style="width:100%">
                  <tr>
                    <td></td>
                    <td><b>Unique Donors</b></td>
                    <td><b>Amount</b></td>
                  </tr>
                  <tr>
                    <td><b>Projected</b></td>
                    <td>@projected_donor</td>
                    <td>@projected_amount</td>
                  </tr>
                  <tr>
                    <td><b>Actual</b></td>
                    <td>@actual_donor</td>
                    <td>@actual_amount</td>
                  </tr>
                </table>
            </div>
            """.format(prev_year, curr_year)

    # Turn off all axis and grid lines
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_label_text_font_size = '0pt'
    p.axis.major_tick_line_color = None
    p.axis[0].ticker.num_minor_ticks = 0
    p.axis[1].ticker.num_minor_ticks = 0

    p.patches(state_xs, state_ys, fill_alpha=0.0, line_color="#696969", line_width=0.75)
    # The following must be the last glyph for the hover to function properly
    p.patches(xs='xs', ys='ys', fill_color='colors', fill_alpha=0.7,
              line_color="#e77575", line_width=0.5, source=source)
    return p

In [31]:
p = plot_market_potential_choropleth(data, 5, 'Market potential based on donations (2015)', 2015)
show(p)

In [ ]: