Computation

We should minimize the amount of smoothing done client-side in D3, since client-side smoothing is harder to control and we probably have fewer options. For each country-year, we should precompute (see the sketch after this list):

  • Lorenz curve on a standard grid (0.0 to 1.0, 1000 steps).
  • CDF on a set of standard linear grids (0 to 10000, 1000 steps) or (0 to 100000, 1000 steps).
  • CDF on a standard log grid (log(1)=0 to log(100000)=5, 1000 steps).
  • PDF on the same set of standard linear grids as the CDF.
    • smoothed version of the same.
  • PDF on a standard log grid (log(1)=0 to log(100000)=5, 1000 steps).
    • smoothed version of the same.
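
A minimal sketch of these standard grids (assuming "1000 steps" means 1000 points including both endpoints; the exact convention is ours to fix):

In [ ]:
import numpy as np

N = 1000
p_grid = np.linspace(0.0, 1.0, N)        # Lorenz: cumulative population share
y_lin = np.linspace(0.0, 1e4, N)         # CDF/PDF: income per day, linear
y_lin_wide = np.linspace(0.0, 1e5, N)    # wider linear grid
y_log = np.logspace(0.0, 5.0, N)         # log grid: 10**0 = 1 to 10**5 = 100000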

In [1]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [6]:
import glob
import json

import numpy as np
import pandas as pd
import scipy.interpolate
import scipy.misc
import scipy.optimize

import wbdata
import datetime

DAYS_PER_MONTH = 365.0/12.0

def coarsen(df, points, maintain_endpoints = True):
    # Thin df to roughly `points` rows by keeping every stepsize-th row.
    nrows = max(df.count(0))
    stepsize = max(int(nrows/points), 1)
    out = df.copy().iloc[::stepsize,:]

    # If we should maintain the endpoints, replace the last coarse point with
    # the true endpoint (they may be the same point if we're lucky)
    if maintain_endpoints:
        out = out[0:-1].append(df.tail(1))
    return out

def lag0(x, lags=1):
    # Shift a series by `lags` places, filling the gap with zeros.
    return x.shift(lags).fillna(0)

def lorenz_to_cdf_cubic(L, mean, minimum, maximum, derive_bounds = False):
    # The slope of the Lorenz curve, scaled by the mean, is the quantile
    # function: y(p) = mean * L'(p). Interpolating the (y, p) pairs gives the CDF.
    dL = (L.L - lag0(L.L)) / (L.p - lag0(L.p))
    y = mean * dL
    p = (L.p + lag0(L.p))/2
    if minimum is not None:  # `is not None`, so a legitimate minimum of 0 is kept
        y = np.hstack((minimum, y))
        p = np.hstack((0.0, p))
    if maximum is not None:
        y = np.hstack((y, maximum))
        p = np.hstack((p, 1.0))
    cdf = scipy.interpolate.CubicSpline(y, p, bc_type="clamped", extrapolate=0)
    #cdf = scipy.interpolate.PchipInterpolator(y, p, extrapolate=False)
    # Bounds implied by the first and last Lorenz segments.
    minimum = mean * float(dL.head(1))
    maximum = mean * float(dL.tail(1))
    
    if derive_bounds:
        return cdf, (minimum, maximum)
    else:
        return cdf
    
def lorenz_cubic(L, minimum=0, maximum=1e4):
    # NB: `minimum` and `maximum` are currently unused; the spline is pinned
    # at (0, 0) with zero second derivative at both ends.
    Lc = scipy.interpolate.CubicSpline(
        np.hstack((0, L.p)),
        np.hstack((0, L.L)),
        bc_type=((2, 0),(2, 0)),
        extrapolate=0)
    return Lc


def inverse_of_scalar(f, x, a=1e-6, b=1-1e-6):
    # Solve f(y) = x for y on (a, b) with Brent's method; NaN if no root is bracketed.
    try:
        return scipy.optimize.brentq(lambda y: f(y)-x, a, b)
    except ValueError:
        return float("NaN")

inverse_of = np.vectorize(inverse_of_scalar, excluded=('f','a','b'))
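
# Hypothetical usage (not part of the pipeline):
#   inverse_of(lambda p: p**2, np.array([0.25, 0.81]))  # ~ array([0.5, 0.9])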

class IncomeDistribution(object):
    def __init__(self, raw_json_fn):
        with open(raw_json_fn) as f:
            self.data = json.load(f)
            
        self.cc3 = self.data['dataset']['iso3c']
        self.year = self.data['dataset']['year']
        self.mean_ppp_day = self.data['summary']['mean_month_ppp'] / DAYS_PER_MONTH
        self.min_ppp_day = self.data['sample']['month_min'] / DAYS_PER_MONTH if 'sample' in self.data else None
        self.max_ppp_day = self.data['sample']['month_max'] / DAYS_PER_MONTH if 'sample' in self.data else None
        self.lorenz = pd.DataFrame(self.data['lorenz'], index=None)
        self.init_interpolation()
        self.fetch_wdi_data()

    def init_interpolation(self):
        # Empirical CDF & PDF
        self.cdf, (self.min_ppp_day, self.max_ppp_day) = lorenz_to_cdf_cubic(self.lorenz, self.mean_ppp_day, self.min_ppp_day, self.max_ppp_day, derive_bounds=True)
        self.pdf = self.cdf.derivative()
        
        # Smoothed (coarsened) CDF & PDF
        self.lorenz_smooth = coarsen(self.lorenz, 15)
        self.cdf_smooth = lorenz_to_cdf_cubic(self.lorenz_smooth, self.mean_ppp_day,  None, None)
        self.pdf_smooth = self.cdf_smooth.derivative()
        
    def fetch_wdi_data(self):
        self.population = float(wbdata.get_data(
            country = self.cc3,
            data_date = datetime.datetime(self.year, 1, 1),
            indicator = "SP.POP.TOTL"
        )[0]['value'])
        
        self.region_wb = wbdata.get_country(country_id = self.cc3, display=False)[0]['region']['id']

    def save_computed_json(self, out_json_fn):
        # Export only the smoothed CDF spline: breakpoints `x` and transposed
        # polynomial coefficients `c` (see the evaluation sketch below).
        out_json = {
            #'cdf': { 'x': self.cdf.x.tolist(), 'c': self.cdf.c.tolist() },
            'cdf_smooth': { 'x': self.cdf_smooth.x.tolist(), 'c': self.cdf_smooth.c.T.tolist() },
        }
        with open(out_json_fn, 'w') as f:
            json.dump(out_json, f)
            
    def pdf_smooth_semilog(self, y):
        return y*self.pdf_smooth(y)
    
    def pdf_semilog(self, y):
        return y*self.pdf(y)
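
# Sketch (hypothetical helper, not used by the pipeline): how a client could
# evaluate the exported spline from its breakpoints `x` (length n) and
# row-per-interval coefficients `c` (n-1 rows, highest order first),
# mirroring scipy's piecewise-polynomial layout.
def eval_exported_spline(x, c, y):
    i = min(max(np.searchsorted(x, y) - 1, 0), len(x) - 2)  # locate the interval
    t = y - x[i]
    a3, a2, a1, a0 = c[i]
    return ((a3 * t + a2) * t + a1) * t + a0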
    
class LorenzIncomeDistribution(IncomeDistribution):
    def __init__(self, raw_json_fn):
        super().__init__(raw_json_fn)
        
    def init_interpolation(self):
        rel_min = self.min_ppp_day / self.mean_ppp_day if self.min_ppp_day else None
        rel_max = self.max_ppp_day / self.mean_ppp_day if self.max_ppp_day else None
        
        # Empirical CDF & PDF
        self.Lc = lorenz_cubic(self.lorenz, rel_min, rel_max)
        self.cdf_inv = lambda p: self.mean_ppp_day * scipy.misc.derivative(self.Lc, p, dx=1e-6) 
        self.cdf = lambda y: inverse_of(self.cdf_inv, y)
        self.pdf = lambda y: scipy.misc.derivative(self.cdf, y, dx=1e-6)
        
        # Smoothed (coarsened) CDF & PDF
        self.lorenz_smooth = coarsen(self.lorenz, 15)
        self.Lc_smooth = lorenz_cubic(self.lorenz_smooth, rel_min, rel_max)
        self.cdf_inv_smooth = lambda p: self.mean_ppp_day * scipy.misc.derivative(self.Lc_smooth, p, dx=1e-6) 
        self.cdf_smooth = lambda y: inverse_of(self.cdf_inv_smooth, y)
        self.pdf_smooth = lambda y: scipy.misc.derivative(self.cdf_smooth, y, dx=1e-6)        
        
    def save_computed_json(self, out_json_fn):
        pass
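
The identity used throughout is that the derivative of the Lorenz curve, scaled by the survey mean, is the quantile function: $y(p) = \mu\,L'(p)$. As a quick sanity check (a sketch, not part of the pipeline), we can feed `lorenz_to_cdf_cubic` a synthetic Pareto Lorenz curve, $L(p) = 1 - (1-p)^{1-1/\alpha}$, whose CDF is known exactly.

In [ ]:
# Sketch: a Pareto with alpha = 2, x_m = 1 has mean alpha*x_m/(alpha-1) = 2,
# and its CDF evaluated at the mean is 1 - (1/2)**2 = 0.75.
p_toy = np.linspace(0.01, 1.0, 50)
L_toy = pd.DataFrame({'p': p_toy, 'L': 1 - (1 - p_toy)**(1 - 1/2.0)})
cdf_toy = lorenz_to_cdf_cubic(L_toy, 2.0, None, None)
cdf_toy(2.0)  # should be close to 0.75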

In [7]:
d = dict()
for fn in glob.glob('../jsoncache/*.json'):
    #print(fn)
    try:
        survey = IncomeDistribution(fn)
        survey.save_computed_json('../computed/{}_{}.json'.format(survey.cc3, survey.year))
        if survey.cc3 not in d:
            d[survey.cc3] = dict()
        d[survey.cc3][survey.year] = survey
    except (ValueError, KeyError) as e:
        print(fn, e)


../jsoncache/AGO_2_2000.21.json `x` must be strictly increasing sequence.
../jsoncache/ARG_2_1992.json `x` must contain only finite values.
../jsoncache/ARG_2_1993.json `x` must contain only finite values.
../jsoncache/ARG_2_1994.json `x` must contain only finite values.
../jsoncache/ARG_2_1995.json `x` must contain only finite values.
../jsoncache/ARG_2_1996.json `x` must contain only finite values.
../jsoncache/ARG_2_1997.json `x` must contain only finite values.
../jsoncache/ARG_2_1998.json `x` must contain only finite values.
../jsoncache/ARG_2_1999.json `x` must contain only finite values.
../jsoncache/ARG_2_2000.json `x` must contain only finite values.
../jsoncache/ARG_2_2001.json `x` must contain only finite values.
../jsoncache/ARG_2_2002.json `x` must contain only finite values.
../jsoncache/ARG_2_2003.json `x` must contain only finite values.
../jsoncache/ARG_2_2007.json `x` must contain only finite values.
../jsoncache/AUS_3_1981.json `x` must be strictly increasing sequence.
../jsoncache/AUS_3_1985.json `x` must be strictly increasing sequence.
../jsoncache/AZE_3_1995.json `x` must be strictly increasing sequence.
../jsoncache/AZE_3_2001.json `x` must be strictly increasing sequence.
../jsoncache/AZE_3_2002.json `x` must be strictly increasing sequence.
../jsoncache/AZE_3_2003.json `x` must be strictly increasing sequence.
../jsoncache/AZE_3_2004.json `x` must be strictly increasing sequence.
../jsoncache/AZE_3_2005.json `x` must be strictly increasing sequence.
../jsoncache/BGD_3_1995.5.json `x` must be strictly increasing sequence.
../jsoncache/BGR_3_1995.json `x` must be strictly increasing sequence.
../jsoncache/BLR_3_1998.json `x` must be strictly increasing sequence.
../jsoncache/BLR_3_1999.json `x` must be strictly increasing sequence.
../jsoncache/BLR_3_2000.json `x` must be strictly increasing sequence.
../jsoncache/BLR_3_2001.json `x` must be strictly increasing sequence.
../jsoncache/BLR_3_2002.json `x` must be strictly increasing sequence.
../jsoncache/BLR_3_2004.json `x` must be strictly increasing sequence.
../jsoncache/BLR_3_2010.json `x` must be strictly increasing sequence.
../jsoncache/BLR_3_2013.json `x` must be strictly increasing sequence.
../jsoncache/BLR_3_2014.json `x` must be strictly increasing sequence.
../jsoncache/BOL_3_1997.json `x` must contain only finite values.
../jsoncache/BOL_3_2000.json `x` must contain only finite values.
../jsoncache/BRA_3_1995.json `x` must contain only finite values.
../jsoncache/BRA_3_2002.json `x` must contain only finite values.
../jsoncache/BRA_3_2003.json `x` must contain only finite values.
../jsoncache/BRA_3_2005.json `x` must contain only finite values.
../jsoncache/BRA_3_2008.json `x` must be strictly increasing sequence.
../jsoncache/BRA_3_2011.json `x` must be strictly increasing sequence.
../jsoncache/BRA_3_2013.json `x` must contain only finite values.
../jsoncache/BRA_3_2014.json `x` must contain only finite values.
../jsoncache/CAN_3_1981.json `x` must be strictly increasing sequence.
../jsoncache/CHL_3_1987.json `x` must be strictly increasing sequence.
../jsoncache/CHN_5_2008.json `x` must be strictly increasing sequence.
../jsoncache/CHN_5_2012.json `x` must be strictly increasing sequence.
../jsoncache/CIV_3_1985.08.json `x` must be strictly increasing sequence.
../jsoncache/CIV_3_1986.08.json `x` must be strictly increasing sequence.
../jsoncache/CIV_3_1987.17.json `x` must be strictly increasing sequence.
../jsoncache/CIV_3_1988.33.json `x` must be strictly increasing sequence.
../jsoncache/CIV_3_1998.json `x` must contain only finite values.
../jsoncache/CIV_3_2002.json `x` must contain only finite values.
../jsoncache/CIV_3_2008.json `x` must contain only finite values.
../jsoncache/COL_3_1992.json `x` must contain only finite values.
../jsoncache/COL_3_1996.json `x` must contain only finite values.
../jsoncache/COL_3_1999.json `x` must contain only finite values.
../jsoncache/COL_3_2000.json `x` must contain only finite values.
../jsoncache/COL_3_2001.json `x` must contain only finite values.
../jsoncache/CRI_3_1989.json `x` must contain only finite values.
../jsoncache/CRI_3_1990.json `x` must contain only finite values.
../jsoncache/CRI_3_1991.json `x` must contain only finite values.
../jsoncache/CRI_3_1992.json `x` must be strictly increasing sequence.
../jsoncache/CRI_3_1993.json `x` must contain only finite values.
../jsoncache/CRI_3_1994.json `x` must be strictly increasing sequence.
../jsoncache/CRI_3_1995.json `x` must be strictly increasing sequence.
../jsoncache/CRI_3_1996.json `x` must be strictly increasing sequence.
../jsoncache/CRI_3_1998.json `x` must be strictly increasing sequence.
../jsoncache/CRI_3_1999.json `x` must be strictly increasing sequence.
../jsoncache/CRI_3_2000.json `x` must be strictly increasing sequence.
../jsoncache/CRI_3_2001.json `x` must contain only finite values.
../jsoncache/DOM_3_1996.json `x` must be strictly increasing sequence.
../jsoncache/DOM_3_1997.json `x` must be strictly increasing sequence.
../jsoncache/ECU_2_1995.json `x` must contain only finite values.
../jsoncache/ECU_3_1998.json `x` must contain only finite values.
../jsoncache/ECU_3_2000.json `x` must contain only finite values.
../jsoncache/GHA_3_1991.73.json `x` must be strictly increasing sequence.
../jsoncache/GHA_3_1998.18.json `x` must be strictly increasing sequence.
../jsoncache/GIN_3_1994.08.json `x` must be strictly increasing sequence.
../jsoncache/GIN_3_2002.25.json `x` must be strictly increasing sequence.
../jsoncache/GNB_3_1993.json `x` must be strictly increasing sequence.
../jsoncache/GNB_3_2010.json `x` must contain only finite values.
../jsoncache/HND_3_1992.json `x` must be strictly increasing sequence.
../jsoncache/HND_3_1993.json `x` must contain only finite values.
../jsoncache/HND_3_1994.json `x` must be strictly increasing sequence.
../jsoncache/HND_3_1998.json `x` must contain only finite values.
../jsoncache/HND_3_1999.json `x` must contain only finite values.
../jsoncache/HND_3_2002.json `x` must contain only finite values.
../jsoncache/IDN_1_1993.json `x` must be strictly increasing sequence.
../jsoncache/IDN_2_2002.json `x` must be strictly increasing sequence.
../jsoncache/IDN_2_2003.json `x` must be strictly increasing sequence.
../jsoncache/IDN_5_1993.json `x` must be strictly increasing sequence.
../jsoncache/IDN_5_2013.json `x` must be strictly increasing sequence.
../jsoncache/IND_5_2011.5.json `x` must be strictly increasing sequence.
../jsoncache/ISR_3_1979.json `x` must be strictly increasing sequence.
../jsoncache/ISR_3_1986.json `x` must be strictly increasing sequence.
../jsoncache/ISR_3_1992.json `x` must be strictly increasing sequence.
../jsoncache/JAM_3_1990.json `x` must be strictly increasing sequence.
../jsoncache/JAM_3_1996.json `x` must be strictly increasing sequence.
../jsoncache/JAM_3_2004.json `x` must be strictly increasing sequence.
../jsoncache/KAZ_3_1996.json `x` must be strictly increasing sequence.
../jsoncache/KGZ_3_2000.json `x` must be strictly increasing sequence.
../jsoncache/KGZ_3_2001.json `x` must be strictly increasing sequence.
../jsoncache/KGZ_3_2002.json `x` must be strictly increasing sequence.
../jsoncache/KGZ_3_2003.json `x` must be strictly increasing sequence.
../jsoncache/KHM_3_2004.json `x` must be strictly increasing sequence.
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-7-d501078949e6> in <module>()
      3     #print(fn)
      4     try:
----> 5         survey = IncomeDistribution(fn)
      6         survey.save_computed_json('../computed/{}_{}.json'.format(survey.cc3, survey.year))
      7         if survey.cc3 not in d:

<ipython-input-6-d8e16531a7f0> in __init__(self, raw_json_fn)
     75         self.lorenz = pd.DataFrame(self.data['lorenz'], index=None)
     76         self.init_interpolation()
---> 77         self.fetch_wdi_data()
     78 
     79     def init_interpolation(self):

<ipython-input-6-d8e16531a7f0> in fetch_wdi_data(self)
     91             country = self.cc3,
     92             data_date = datetime.datetime(self.year, 1, 1),
---> 93             indicator = "SP.POP.TOTL"
     94         )[0]['value'])
     95 

/usr/local/lib/python3.5/site-packages/wbdata/api.py in get_data(indicator, country, data_date, convert_date, pandas, column_name, keep_levels)
    165         else:
    166             args.append(("date", data_date.strftime("%Y")))
--> 167     data = fetcher.fetch(query_url, args)
    168     if convert_date:
    169         data = convert_dates_to_datetime(data)

/usr/local/lib/python3.5/site-packages/wbdata/fetcher.py in fetch(query_url, args, cached)
    166         if response is None:
    167             raise ValueError("Got no response")
--> 168         results.extend(response[1])
    169         this_page = response[0]['page']
    170         pages = response[0]['pages']

TypeError: 'NoneType' object is not iterable

In [ ]:
d

In [8]:
#country_years = [('USA', 2010), ('GBR', 2010), ('AUS', 2010), ('MEX', 2010)]
latest_country_years = {cc3: max(d[cc3].keys()) for cc3 in d}
latest_country_years


Out[8]:
{'AGO': 2008,
 'ALB': 2012,
 'ARG': 2014,
 'ARM': 2014,
 'AUS': 2010,
 'AUT': 2012,
 'AZE': 2008,
 'BDI': 2006,
 'BEL': 2012,
 'BEN': 2011,
 'BFA': 2014,
 'BGD': 2010,
 'BGR': 2012,
 'BIH': 2011,
 'BLR': 2012,
 'BLZ': 1999,
 'BOL': 2014,
 'BRA': 2012,
 'BTN': 2012,
 'BWA': 2009,
 'CAF': 2008,
 'CAN': 2010,
 'CHE': 2012,
 'CHL': 2013,
 'CHN': 2013,
 'CIV': 1995,
 'CMR': 2014,
 'COG': 2011,
 'COL': 2014,
 'COM': 2004,
 'CPV': 2007,
 'CRI': 2014,
 'CYP': 2012,
 'CZE': 2012,
 'DEU': 2011,
 'DJI': 2013,
 'DNK': 2012,
 'DOM': 2013,
 'ECU': 2014,
 'ESP': 2012,
 'EST': 2012,
 'ETH': 2010,
 'FIN': 2012,
 'FJI': 2008,
 'FRA': 2012,
 'FSM': 2013,
 'GAB': 2005,
 'GBR': 2012,
 'GEO': 2014,
 'GHA': 2005,
 'GIN': 2012,
 'GMB': 2003,
 'GNB': 2002,
 'GRC': 2012,
 'GTM': 2014,
 'GUY': 1998,
 'HND': 2014,
 'HRV': 2012,
 'HTI': 2012,
 'HUN': 2012,
 'IDN': 2014,
 'IND': 2011,
 'IRL': 2012,
 'IRN': 2013,
 'ISL': 2012,
 'ISR': 2010,
 'ITA': 2012,
 'JAM': 2002,
 'JPN': 2008,
 'KAZ': 2013,
 'KEN': 2005,
 'KGZ': 2014,
 'KHM': 2012,
 'KIR': 2006}

In [9]:
r = d['CHN'][2011]
pc = r.lorenz_smooth.p.values
Lc = r.lorenz_smooth.L.values
mean = r.mean_ppp_day

dL = np.diff(Lc)
dp = np.diff(pc)

pmid = np.hstack((0.0, np.add(pc[1:],pc[:-1])/2))

y = np.hstack((0.0, mean * dL/dp))
mean


Out[9]:
4.710805479452055

In [10]:
##########################################
plt.rcParams["figure.figsize"] = (12,4)
fig, ax = plt.subplots(1, 3)
##########################################

### F(x) = 1 - (x/xm)^a => solve for xm and a
p1 = pmid[-2]
y1 = y[-2]

p2 = pmid[-1]
y2 = y[-1]

#alpha = (np.log(1-p2) - np.log(1-p1)) / (np.log(y1) - np.log(y2))
#ym = np.exp(np.log(y2) + (1/alpha)*np.log(1-p2))
#yshift = 0

def pareto_cdf(y, ym, alpha, yshift):
    return 1 - (ym / (y - yshift))**alpha

def pareto_pdf(y, ym, alpha, yshift):
    return alpha * ym**alpha / (y - yshift)**(alpha+1)

def pareto_pdfderiv(y, ym, alpha, yshift):
    return -alpha*(alpha+1) * ym**alpha / (y - yshift)**(alpha+2)

def target(parameters):
    # Objective: match the Pareto tail to the spline at the splice point y1
    # (CDF level, PDF level, and PDF slope, weighted 1/20/100) and hit the
    # CDF level at the top point y2.
    dd1, ym, alpha, yshift = parameters
    dd1 = dd1/100
    yshift = yshift*mean

    ccubpar_cdf = scipy.interpolate.CubicSpline(y[:-1], pmid[:-1], bc_type=((1, 0.0),(2,dd1)),extrapolate=0)
    ccubpar_pdf = ccubpar_cdf.derivative()

    err1 = pareto_cdf(y1, ym, alpha, yshift) - p1
    err1d = pareto_pdf(y1, ym, alpha, yshift) - ccubpar_pdf(y1)
    err1dd = pareto_pdfderiv(y1, ym, alpha, yshift) - ccubpar_pdf.derivative()(y1)
    err2 = pareto_cdf(y2, ym, alpha, yshift) - p2
    return err1**2 + (err1d*20)**2 + (err1dd*100)**2 + err2**2

from scipy.optimize import basinhopping
#guess = (-0.000015, 2.85446391657, 0.783499652638, 135.703869627)
guess = (0, 1, 1, 1)
dd1, ym, alpha, yshift = basinhopping(target, guess).x
dd1 = dd1/100
yshift = yshift*mean
print(dd1, ym, alpha, yshift)

ccubpar_cdf = scipy.interpolate.CubicSpline(y[:-1], pmid[:-1], bc_type=((1, 0.0),(2,dd1)),extrapolate=0)
ccubpar_pdf = ccubpar_cdf.derivative()
d1 = ccubpar_pdf(y1)
dd1 = ccubpar_pdf.derivative()(y1)

xgrid = np.linspace(0.0, 1.0, 1000)
ygrid = np.linspace(0.0, mean*10.0, 1000)

ax[0].plot(y, pmid, ".")
ax[0].plot(ygrid, ccubpar_cdf(ygrid), 'b-')
ax[1].plot(ygrid, ccubpar_pdf(ygrid), 'b-')

ax[0].plot(ygrid, pareto_cdf(ygrid, ym, alpha, yshift), 'r-')
ax[0].set_xlim((0,mean*10))
ax[0].set_ylim((0, 1.0))

ygrid2 = np.linspace(y1, mean*10, 1000)

ax[1].plot(ygrid2, pareto_pdf(ygrid2, ym, alpha, yshift), 'r-')

final_pdf = np.vectorize(lambda y: ccubpar_pdf(y) if y < y1 else pareto_pdf(y, ym, alpha, yshift))
ax[2].plot(ygrid, final_pdf(ygrid))
import scipy.integrate
print(scipy.integrate.quad(final_pdf, 35.15, 1e3)[0])


/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:20: RuntimeWarning: invalid value encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:23: RuntimeWarning: invalid value encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:26: RuntimeWarning: invalid value encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:23: RuntimeWarning: overflow encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:26: RuntimeWarning: overflow encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:20: RuntimeWarning: invalid value encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:23: RuntimeWarning: invalid value encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:26: RuntimeWarning: invalid value encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:23: RuntimeWarning: overflow encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:26: RuntimeWarning: overflow encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:20: RuntimeWarning: invalid value encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:23: RuntimeWarning: invalid value encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:23: RuntimeWarning: overflow encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:26: RuntimeWarning: invalid value encountered in double_scalars
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:26: RuntimeWarning: overflow encountered in double_scalars
-0.00519586744446 0.244747657131 0.942765458413 5.7953720278
0.010569458830270703
/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:20: RuntimeWarning: invalid value encountered in power
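
As a small consistency check (a sketch, not in the original pipeline), `pareto_pdf` should match a numerical derivative of `pareto_cdf`:

In [ ]:
# Sketch: central-difference check that pareto_pdf = d/dy pareto_cdf,
# at arbitrary illustrative parameter values.
eps = 1e-6
ym_, alpha_, yshift_ = 2.0, 1.5, 0.5
y_ = 5.0
numeric = (pareto_cdf(y_ + eps, ym_, alpha_, yshift_)
           - pareto_cdf(y_ - eps, ym_, alpha_, yshift_)) / (2*eps)
numeric, pareto_pdf(y_, ym_, alpha_, yshift_)  # should agree closely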

In [11]:
dist = d['IND'][2011]
grid = np.arange(0,20,0.1)
#plt.plot(grid,dist.cdf(grid))
dist.cdf_inv(0.5)  # NB: cdf_inv is defined only on LorenzIncomeDistribution, hence the error below
#scipy.optimize.brentq(lambda x: dist.cdf_inv(x)-2.806, 1e-3, 1-1e-3)
inverse_of_scalar(dist.cdf_inv, 2.806)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-11-ffa7ec46c94e> in <module>()
      2 grid = np.arange(0,20,0.1)
      3 #plt.plot(grid,dist.cdf(grid))
----> 4 dist.cdf_inv(0.5)
      5 #scipy.optimize.brentq(lambda x: dist.cdf_inv(x)-2.806, 1e3, 1-1e3)
      6 inverse_of_scalar(dist.cdf_inv, 2.806)

AttributeError: 'IncomeDistribution' object has no attribute 'cdf_inv'

In [12]:
dist = d['IND'][2011]

grid = np.arange(0, 20, 0.2)

plt.rcParams["figure.figsize"] = (12,6)

plt.plot(grid,dist.pdf(grid),'g')
plt.plot(grid,dist.pdf_smooth(grid),'r')

plt.show()

import scipy.integrate
scipy.integrate.quad(dist.pdf, dist.min_ppp_day, dist.max_ppp_day)

#dist.lorenz_smooth

#plt.plot(dist.lorenz_smooth.p, dist.lorenz_smooth.L)
dist.max_ppp_day

dist.cdf_smooth(11)


/usr/local/lib/python3.5/site-packages/scipy/integrate/quadpack.py:356: IntegrationWarning: The maximum number of subdivisions (50) has been achieved.
  If increasing the limit yields no improvement it is advised to analyze 
  the integrand in order to determine the difficulties.  If the position of a 
  local difficulty can be determined (singularity, discontinuity) one will 
  probably gain from splitting up the interval and calling the integrator 
  on the subranges.  Perhaps a special-purpose integrator should be used.
  warnings.warn(msg, IntegrationWarning)
Out[12]:
array(0.944296669724061)

In [13]:
for cc3, year in latest_country_years.items():
    if d[cc3][year].pdf(100) < 0:
        print(cc3, year, d[cc3][year].pdf(100))


CAF 2008 -5.994596476442348e-05
FJI 2008 -6.8560715024541685e-06
FSM 2013 -0.0001841730560130262
BTN 2012 -3.6812498281650794e-06
ETH 2010 -8.346612455249826e-05
HUN 2012 -4.7518753590278445e-05
GAB 2005 -0.00011299676543753428
GTM 2014 -8.759303008168309e-05
AGO 2008 -0.00010838272552244157
DJI 2013 -4.558960465715769e-06

In [14]:
region_cols = {'EAS': 'red', 'ECS': 'blue', 'LCN': 'green', 'MEA': 'yellow', 'NAC': 'blue', 'SAS': 'orange', 'SSF': 'pink'}
region_sort = {'EAS': 7, 'ECS': 2, 'LCN': 4, 'MEA': 5, 'NAC': 1, 'SAS': 6, 'SSF': 3}
grid = np.arange(0,300,0.2)

# Sort countries by region (then population), and weight each smoothed
# semilog density by population before stacking.
stacks = sorted(latest_country_years.items(), key = lambda x: (region_sort[d[x[0]][x[1]].region_wb], d[x[0]][x[1]].population))
bigdf = np.row_stack([np.nan_to_num(d[cc3][year].population*d[cc3][year].pdf_smooth_semilog(grid)) for (cc3, year) in stacks])

ax = plt.subplot()
ax.set_xscale("log", nonposx='clip')
plt.xlim(0.1,300)
plt.stackplot(grid,bigdf,colors=[region_cols[d[cc3][year].region_wb] for (cc3, year) in stacks],edgecolor='none')
plt.axvline(1.90)   # the $1.90/day extreme poverty line
plt.axvline(20)
plt.show()



In [293]:
grid


Out[293]:
array([  0. ,   0.2,   0.4,   0.6,   0.8,   1. ,   1.2,   1.4,   1.6,
         1.8,   2. ,   2.2,   2.4,   2.6,   2.8,   3. ,   3.2,   3.4,
         3.6,   3.8,   4. ,   4.2,   4.4,   4.6,   4.8,   5. ,   5.2,
         5.4,   5.6,   5.8,   6. ,   6.2,   6.4,   6.6,   6.8,   7. ,
         7.2,   7.4,   7.6,   7.8,   8. ,   8.2,   8.4,   8.6,   8.8,
         9. ,   9.2,   9.4,   9.6,   9.8,  10. ,  10.2,  10.4,  10.6,
        10.8,  11. ,  11.2,  11.4,  11.6,  11.8,  12. ,  12.2,  12.4,
        12.6,  12.8,  13. ,  13.2,  13.4,  13.6,  13.8,  14. ,  14.2,
        14.4,  14.6,  14.8,  15. ,  15.2,  15.4,  15.6,  15.8,  16. ,
        16.2,  16.4,  16.6,  16.8,  17. ,  17.2,  17.4,  17.6,  17.8,
        18. ,  18.2,  18.4,  18.6,  18.8,  19. ,  19.2,  19.4,  19.6,  19.8])

Issues

  • It seems that BRA 2010 is not a well-formed Lorenz curve: around points 57-58 it is non-convex.
  • Some distributions are just noisy (e.g. GBR 2010); these are messy to plot as smooth densities and may be better plotted as histograms.

When plotting a density on a semilog (x) chart, we need to rescale y to preserve the integral

The maths is relatively straightforward. Ordinarily we plot $f(x)$ against $x$, but now we plot against $y = \log_{10}(x)$. Then $x = 10^y$, so we are in effect plotting $f(10^y)$ against $y$ (with tick labels still written in terms of $x$). By a change of variables,

$$\int_a^b f(x)\,dx = \int_{\log_{10}(a)}^{\log_{10}(b)} (\log 10)\, 10^y f(10^y)\,dy = K \int_{\log_{10}(a)}^{\log_{10}(b)} x f(x)\,dy,$$

where $K = \log 10$ and $x = 10^y$. So we need to rescale the y-values by $x$ in order to preserve areas on a semilog chart. With this transformation the appearance of shapes (e.g. symmetry) is not preserved and the y-axis becomes hard to interpret. The two cells below illustrate the rescaling with a pair of unit-width uniform densities.


In [ ]:
%matplotlib notebook
from scipy.stats import uniform

x_axis = np.arange(0, 20, 0.01)
y1 = uniform.pdf(x_axis, 1, 1)    # uniform on [1, 2]
y2 = uniform.pdf(x_axis, 10, 1)   # uniform on [10, 11]
plt.plot(x_axis, y1)
plt.plot(x_axis, y2)

In [ ]:
plt.semilogx(x_axis, x_axis*y1)   # rescaled by x: equal areas preserved on the log axis
plt.semilogx(x_axis, x_axis*y2)

In [ ]:
wbdata.search_indicators("electricity")
