Experiments with Bokeh

This notebook has some experimentation with bokeh that I created to learn bokeh. This is merely an experimentation platform.


In [596]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter
%matplotlib inline

In [2]:
donations = pd.read_pickle('out/21/donations.pkl')
us_states = pd.read_pickle('out/11/states_gps.pkl')
us_counties = pd.read_pickle('out/11/counties_gps.pkl')
population = pd.read_pickle('out/11/indian_population.pkl')

In [3]:
population = population.drop(population[population.state.isin(['AK', 'HI', 'PR', 'GU', 'VI', 'MP', 'AS'])].index, axis=0)
donations = donations.drop(donations[donations.state.isin(['AK', 'HI', 'PR', 'GU', 'VI', 'MP', 'AS'])].index, axis=0)

In [4]:
# Color map for non-charitable and charitable
colormap = ['indianred','steelblue']
thousands_formatter = FuncFormatter('{:,.0f}'.format)
sample_donor_id = '_1D50SWTKX'

In [5]:
donations[donations.donor_id == sample_donor_id].head()


Out[5]:
activity_date city fund batch_num amount_initial amount_cleanup zipcode longitude sales county ... county_norm census_region_name state_name county_id is_service channel campaign_location_id is_location_center campaign_month_id is_month_center
131735 2006-08-11 Kenmore Focus_AP_PhaseII 927 $270.00 270.00 98028 -122.24 0 King ... king West Washington 2929 False Payroll NaN NaN NaN NaN
131736 2007-02-22 Kenmore Focus_AP_PhaseII 1198 $270.00 270.00 98028 -122.24 0 King ... king West Washington 2929 False Matching NaN NaN NaN NaN
131737 2010-11-30 Kenmore Project_Punjab 3195 $360.00 360.00 98028 -122.24 0 King ... king West Washington 2929 False Matching NaN NaN NaN NaN
131738 2007-02-22 Kenmore Focus_AP_PhaseII 1198 $270.00 270.00 98028 -122.24 0 King ... king West Washington 2929 False Payroll NaN NaN NaN NaN
131739 2007-11-20 Kenmore Project_Shimoga 1441 $1,479.00 1479.00 98028 -122.24 0 King ... king West Washington 2929 False Matching NaN NaN NaN NaN

5 rows × 33 columns


In [6]:
donations.columns


Out[6]:
Index([u'activity_date', u'city', u'fund', u'batch_num', u'amount_initial',
       u'amount_cleanup', u'zipcode', u'longitude', u'sales', u'county',
       u'charitable', u'amount', u'state', u'donor_id', u'timezone',
       u'latitude', u'appeal', u'activity_year', u'activity_month',
       u'activity_dow', u'activity_ym', u'activity_yq', u'activity_ymd',
       u'county_norm', u'census_region_name', u'state_name', u'county_id',
       u'is_service', u'channel', u'campaign_location_id',
       u'is_location_center', u'campaign_month_id', u'is_month_center'],
      dtype='object')

In [7]:
fig, axes = plt.subplots(2,1, sharex=True, figsize=(12,8))

donations.groupby(['activity_year', 'is_service']).amount.sum().unstack()\
  .plot(ax=axes[0], kind='bar', color=colormap, stacked=True,
       title='How much money is coming in as donations/contributions every year?')
axes[0].get_yaxis().set_major_formatter(thousands_formatter)
axes[0].set_ylabel('Total amount')

donations.groupby(['activity_year', 'is_service']).donor_id.nunique().unstack()\
  .plot(ax=axes[1], kind='bar', color=colormap, stacked=True,
       title='How many donors are donating/contributing every year to SEF?')
axes[1].set_ylabel('Number of distinct donors')
axes[1].set_xlabel('Year of activity (data until Nov-2015)')

plt.show()



In [8]:
cumulative_years = np.cumsum(
    donations.groupby(['activity_year', 'activity_month'])['amount', ]\
        .sum()\
        .unstack()\
        .fillna(0)
    , axis=1, dtype='int64').stack()

In [9]:
from bokeh.plotting import figure, output_notebook, show, ColumnDataSource
from bokeh.models.formatters import NumeralTickFormatter
import calendar
from bokeh.models import HoverTool
import seaborn as sns

def flatten(lst):
    return [item for sublist in lst for item in sublist]

def monthly_amount_multiline_plot(year_month_data, cumulative=True, **kwargs):
    hover = HoverTool(
            tooltips="""
            <div>
                @months, @years
            </div>
            """)

    ylabel = 'Total Amount'
    if cumulative == True:
        ylabel = ylabel + ' (cumulative)'
        data = np.cumsum(year_month_data, axis=1, dtype='int64')
    else:
        data = year_month_data.copy()

    amounts = data.stack()
    years = sorted(data.index.get_level_values(0).unique())
    numyears = len(years)
    months = range(1,13)

    palette = sns.color_palette("muted", numyears).as_hex()
    xs = np.array(months * numyears).reshape(numyears, 12).tolist()
    ys = data.amount.values.reshape(numyears, 12).astype('int32').tolist()
    print xs
    source = ColumnDataSource(
            data=dict(
                x=flatten(xs),
                y=flatten(ys),
                months=calendar.month_abbr[1:]*numyears,
                years=np.repeat(years, 12),
            )
    )

    p = figure(plot_width=600, plot_height=600, x_range=calendar.month_abbr[1:], tools=[hover], **kwargs)
    p.multi_line(xs=xs, ys=ys, line_width=2, line_color=palette)
    p.circle('x', 'y', size=10, source=source)
    p.xaxis.axis_label = 'Month'
    p.yaxis.axis_label = ylabel
    p.yaxis.formatter = NumeralTickFormatter(format='0,0')
    return p

In [10]:
ymdata = donations.groupby(['activity_year', 'activity_month'])['amount', ]\
        .sum()\
        .unstack()\
        .fillna(0)
ymdata.index.get_level_values(0).unique()


Out[10]:
array([2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015], dtype=int64)

In [11]:
output_notebook()
p  = monthly_amount_multiline_plot(ymdata, cumulative=True, title='Amount donated over the years')
show(p)


BokehJS successfully loaded.
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]